Merge "Remove duplicated variables in EXT_INTER" into nextgenv2
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 597d5b2..ee46820 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -111,6 +111,9 @@
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/av1_iht16x16_256_add sse2 avx2/;
+
+    add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/av1_iht32x32_1024_add/;
   }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -141,6 +144,10 @@
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/av1_iht16x16_256_add/;
+
+    add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/av1_iht32x32_1024_add/;
+
   } else {
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
@@ -169,6 +176,9 @@
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
 
+    add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/av1_iht32x32_1024_add/;
+
     if (aom_config("CONFIG_EXT_TX") ne "yes") {
       specialize qw/av1_iht4x4_16_add msa/;
       specialize qw/av1_iht8x8_64_add msa/;
@@ -176,6 +186,13 @@
     }
   }
 }
+add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+specialize qw/av1_iht32x32_1024_add/;
+
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+  add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/av1_iht64x64_4096_add/;
+}
 
 if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
   add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 2ec83ec..6c0eb3d 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -134,24 +134,32 @@
 #if CONFIG_CB4X4
   TX_2X2,  // 2x2 transform
 #endif
-  TX_4X4,                   // 4x4 transform
-  TX_8X8,                   // 8x8 transform
-  TX_16X16,                 // 16x16 transform
-  TX_32X32,                 // 32x32 transform
-  TX_4X8,                   // 4x8 transform
-  TX_8X4,                   // 8x4 transform
-  TX_8X16,                  // 8x16 transform
-  TX_16X8,                  // 16x8 transform
-  TX_16X32,                 // 16x32 transform
-  TX_32X16,                 // 32x16 transform
-  TX_SIZES_ALL,             // Includes rectangular transforms
-  TX_SIZES = TX_32X32 + 1,  // Does NOT include rectangular transforms
-  TX_INVALID = 255          // Invalid transform size
+  TX_4X4,    // 4x4 transform
+  TX_8X8,    // 8x8 transform
+  TX_16X16,  // 16x16 transform
+  TX_32X32,  // 32x32 transform
+#if CONFIG_TX64X64
+  TX_64X64,           // 64x64 transform
+#endif                // CONFIG_TX64X64
+  TX_4X8,             // 4x8 transform
+  TX_8X4,             // 8x4 transform
+  TX_8X16,            // 8x16 transform
+  TX_16X8,            // 16x8 transform
+  TX_16X32,           // 16x32 transform
+  TX_32X16,           // 32x16 transform
+#if 0                 // CONFIG_TX64X64
+  // TODO(debargha): To be enabled later
+  TX_32X64,                 // 32x64 transform
+  TX_64X32,                 // 64x32 transform
+#endif                // CONFIG_TX64X64
+  TX_SIZES_ALL,       // Includes rectangular transforms
+  TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
+  TX_INVALID = 255    // Invalid transform size
 } TX_SIZE;
 
 #define MAX_TX_DEPTH (TX_32X32 - TX_4X4)
 
-#define MAX_TX_SIZE_LOG2 5
+#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64)
 #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
 #define MIN_TX_SIZE_LOG2 2
 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 223c577..2663d2d 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -23,14 +23,14 @@
 int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
                  const TX_SIZE tx_size) {
   (void)tx_type;
-#if CONFIG_AOM_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    return txsize_sqr_up_map[tx_size] == TX_32X32;
-  }
-#else
   (void)xd;
-#endif
-  return txsize_sqr_up_map[tx_size] == TX_32X32;
+  if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1;  // Sizes mapped to TX_32X32 get scale 1.
+#if CONFIG_TX64X64
+  else if (txsize_sqr_up_map[tx_size] == TX_64X64)
+    return 2;  // Sizes mapped to TX_64X64 get scale 2.
+#endif  // CONFIG_TX64X64
+  else
+    return 0;  // Every other size gets scale 0.
 }
 
 // NOTE: The implementation of all inverses need to be aware of the fact
@@ -58,6 +58,14 @@
   int i;
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
+
+#if CONFIG_TX64X64
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  // Index over the 64 coefficients of one row/column.
+  for (i = 0; i < 64; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);  // Scale by 4 * Sqrt2, then round-shift.
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 // For use in lieu of ADST
@@ -94,12 +102,6 @@
   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
 }
 
-static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; ++i)
-    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
-}
-
 // For use in lieu of ADST
 static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -174,7 +176,10 @@
     output[i] =
         HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
 }
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_EXT_TX
 
+#if CONFIG_TX64X64
 // For use in lieu of ADST
 static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
                                   int bd) {
@@ -215,7 +220,6 @@
   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 // Inverse identity transform and add.
@@ -223,7 +227,7 @@
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs, int tx_type) {
   int r, c;
-  const int shift = bs < 32 ? 3 : 2;
+  const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
   if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
@@ -929,6 +933,7 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_TX64X64
 void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -938,6 +943,7 @@
     { ihalfright64_c, idct64_row_c },    // ADST_DCT
     { idct64_col_c, ihalfright64_c },    // DCT_ADST
     { ihalfright64_c, ihalfright64_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
     { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
     { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
@@ -950,6 +956,7 @@
     { iidtx64_c, ihalfright64_c },       // H_ADST
     { ihalfright64_c, iidtx64_c },       // V_FLIPADST
     { iidtx64_c, ihalfright64_c },       // H_FLIPADST
+#endif                                   // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -979,7 +986,9 @@
     IHT_64[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < 64; ++i) {
@@ -991,7 +1000,6 @@
   }
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 
 // idct
 void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1056,6 +1064,14 @@
     aom_idct32x32_1024_add(input, dest, stride);
 }
 
+#if CONFIG_TX64X64
+void av1_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  (void)eob;  // eob is never read: the same 64x64 path runs for any value.
+  av1_iht64x64_4096_add(input, dest, stride, DCT_DCT);  // Forward to the 64x64 iht entry point with DCT_DCT.
+}
+#endif  // CONFIG_TX64X64
+
 void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -1206,6 +1222,35 @@
   }
 }
 
+#if CONFIG_TX64X64
+void av1_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, TX_TYPE tx_type) {
+  switch (tx_type) {  // Select the 64x64 inverse routine for this tx_type.
+    case DCT_DCT: av1_idct64x64_add(input, dest, stride, eob); break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:  // Every type listed down to H_FLIPADST falls through to the C iht64x64 kernel.
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_iht64x64_4096_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX: inv_idtx_add_c(input, dest, stride, 64, tx_type); break;  // Identity path with block size 64.
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;  // Any tx_type not handled above trips the assert.
+  }
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void av1_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
@@ -1835,6 +1880,7 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_TX64X64
 void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1844,6 +1890,7 @@
     { highbd_ihalfright64_c, highbd_idct64_row_c },    // ADST_DCT
     { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_ADST
     { highbd_ihalfright64_c, highbd_ihalfright64_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { highbd_ihalfright64_c, highbd_idct64_row_c },    // FLIPADST_DCT
     { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_FLIPADST
     { highbd_ihalfright64_c, highbd_ihalfright64_c },  // FLIPADST_FLIPADST
@@ -1856,6 +1903,7 @@
     { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_ADST
     { highbd_ihalfright64_c, highbd_iidtx64_c },       // V_FLIPADST
     { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_FLIPADST
+#endif                                                 // CONFIG_EXT_TX
   };
 
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1887,7 +1935,9 @@
     HIGH_IHT_64[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < 64; ++i) {
@@ -1900,7 +1950,6 @@
   }
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 
 // idct
 void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -2155,6 +2204,42 @@
     default: assert(0); break;
   }
 }
+
+#if CONFIG_TX64X64
+void av1_highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, int eob, int bd,
+                                   TX_TYPE tx_type) {
+  (void)eob;  // eob is never read in this function.
+  switch (tx_type) {  // Select the high-bitdepth 64x64 inverse routine for this tx_type.
+    case DCT_DCT:  // DCT_DCT uses the 2-D txfm routine, which is handed the CONVERT_TO_SHORTPTR'd destination.
+      av1_inv_txfm2d_add_64x64(input, CONVERT_TO_SHORTPTR(dest), stride,
+                               DCT_DCT, bd);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:  // Every type listed down to H_FLIPADST falls through to the C highbd iht64x64 kernel.
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_highbd_iht64x64_4096_add_c(input, dest, stride, tx_type, bd);  // dest is passed unconverted here.
+      break;
+    case IDTX:  // Identity path with block size 64.
+      highbd_inv_idtx_add_c(input, dest, stride, 64, tx_type, bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;  // Any tx_type not handled above trips the assert.
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -2165,6 +2250,11 @@
   const int lossless = inv_txfm_param->lossless;
 
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      av1_inv_txfm_add_64x64(input, dest, stride, eob, tx_type);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       av1_inv_txfm_add_32x32(input, dest, stride, eob, tx_type);
       break;
@@ -2206,6 +2296,11 @@
   const int lossless = inv_txfm_param->lossless;
 
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      av1_highbd_inv_txfm_add_64x64(input, dest, stride, eob, bd, tx_type);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       av1_highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type);
       break;
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 8f0a212..f1c4239 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -369,6 +369,7 @@
   const int tile_height = rst->tile_height >> rst->subsampling_y;
   int i, j, subtile_idx;
   int h_start, h_end, v_start, v_end;
+  const int shift = bit_depth - 8;
 
   for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx) {
     uint16_t *data_p, *tmpdata_p;
@@ -398,7 +399,7 @@
         for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; ++y) {
           for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; ++x) {
             wt = (int)wx_lut[y + RESTORATION_HALFWIN][x + RESTORATION_HALFWIN] *
-                 (int)wr_lut_[data_p2[x] - data_p[j]];
+                 (int)wr_lut_[(data_p2[x] >> shift) - (data_p[j] >> shift)];
             wtsum += (int64_t)wt;
             flsum += (int64_t)wt * data_p2[x];
           }
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index f73e777..89534de 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -215,7 +215,7 @@
   } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
     return p[1];
   } else {
-    const int64_t v1 = x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
+    const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
     const int64_t v2 = x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
     const int64_t v3 = x * (p[1] - p[-1]);
     const int64_t v4 = 2 * p[0];
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 0c155e0..8b0de2f 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -6180,11 +6180,12 @@
     ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     int coeff_ctx = 1;
+    RD_STATS this_rd_stats;
 
-    this_rate = 0;
-    this_dist = 0;
-    pnsse = 0;
-    pnskip = 1;
+    this_rd_stats.rate = 0;
+    this_rd_stats.dist = 0;
+    this_rd_stats.sse = 0;
+    this_rd_stats.skip = 1;
 
     tx_size = max_txsize_lookup[bsize];
     tx_size =
@@ -6194,8 +6195,13 @@
 
     av1_subtract_plane(x, bsize, plane);
     av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0,
-                      get_plane_block_size(bsize, pd), coeff_ctx, &this_rate,
-                      &this_dist, &pnsse, &pnskip);
+                      get_plane_block_size(bsize, pd), coeff_ctx,
+                      &this_rd_stats);
+
+    this_rate = this_rd_stats.rate;
+    this_dist = this_rd_stats.dist;
+    pnsse = this_rd_stats.sse;
+    pnskip = this_rd_stats.skip;
 #else
     tx_size = max_txsize_lookup[bsize];
     tx_size =
@@ -6223,7 +6229,9 @@
     ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
     const struct macroblockd_plane *const pd = &xd->plane[0];
     int coeff_ctx = 1;
+    RD_STATS this_rd_stats;
 #endif  // CONFIG_VAR_TX
+
 #if CONFIG_EXT_TX
     if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
 #else
@@ -6232,15 +6240,20 @@
     mbmi->tx_type = tx_type;
 
 #if CONFIG_VAR_TX
-    this_rate = 0;
-    this_dist = 0;
-    pnsse = 0;
-    pnskip = 1;
+    this_rd_stats.rate = 0;
+    this_rd_stats.dist = 0;
+    this_rd_stats.sse = 0;
+    this_rd_stats.skip = 1;
 
     av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
     coeff_ctx = combine_entropy_contexts(ctxa[0], ctxl[0]);
-    av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, coeff_ctx, &this_rate,
-                      &this_dist, &pnsse, &pnskip);
+    av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, coeff_ctx,
+                      &this_rd_stats);
+
+    this_rate = this_rd_stats.rate;
+    this_dist = this_rd_stats.dist;
+    pnsse = this_rd_stats.sse;
+    pnskip = this_rd_stats.skip;
 #else
     av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
                                  &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 5d88dbf..d8abea9 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -25,7 +25,7 @@
 #define MAX_CORNERS 4096
 #define MIN_INLIER_PROB 0.1
 
-INLINE RansacFunc get_ransac_type(TransformationType type) {
+static INLINE RansacFunc get_ransac_type(TransformationType type) {
   switch (type) {
     case HOMOGRAPHY: return ransac_homography;
     case AFFINE: return ransac_affine;
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index ff03516..a88c884 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -24,6 +24,24 @@
     av1_fht32x32(src, dst, src_stride, DCT_DCT);
 }
 
+#if CONFIG_TX64X64
+static INLINE void fdct64x64(const int16_t *src, tran_low_t *dst,
+                             int src_stride) {
+  av1_fht64x64(src, dst, src_stride, DCT_DCT);  // Thin wrapper: 64x64 forward transform fixed to DCT_DCT.
+}
+
+static INLINE void fdct64x64_1(const int16_t *src, tran_low_t *dst,
+                               int src_stride) {
+  int i, j;  // Row and column indices into the 64x64 source block.
+  int32_t sum = 0;  // Accumulates all 64 * 64 source samples.
+  memset(dst, 0, sizeof(*dst) * 4096);  // Zero all 4096 outputs; only dst[0] is written afterwards.
+  for (i = 0; i < 64; ++i)
+    for (j = 0; j < 64; ++j) sum += src[i * src_stride + j];
+  // Note: this scaling makes the transform 2 times unitary
+  dst[0] = ROUND_POWER_OF_TWO_SIGNED(sum, 5);  // DC-only result: the sample sum rounded down by 5 bits.
+}
+#endif  // CONFIG_TX64X64
+
 static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -192,6 +210,41 @@
   }
 }
 
+#if CONFIG_TX64X64
+static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {  // Select the 64x64 forward routine for this tx_type.
+    case DCT_DCT:  // Only DCT_DCT honours fwd_txfm_opt (full vs. DC-only).
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        fdct64x64(src_diff, coeff, diff_stride);
+      else  // FWD_TXFM_OPT_DC
+        fdct64x64_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      av1_fht64x64(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case V_DCT:  // These must use the 64x64 kernel too, matching the 64x64 inverse dispatch; 32x32 was a copy-paste slip.
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST: av1_fht64x64(src_diff, coeff, diff_stride, tx_type); break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;  // Any tx_type not handled above trips the assert.
+  }
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type, int lossless,
@@ -379,6 +432,40 @@
     default: assert(0); break;
   }
 }
+
+#if CONFIG_TX64X64
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;  // Not consulted: DCT_DCT always runs the full transform here.
+  (void)bd;  // bd is never read in this function.
+  switch (tx_type) {  // Select the high-bitdepth 64x64 forward routine for this tx_type.
+    case DCT_DCT:
+      av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:  // Every type listed down to H_FLIPADST falls through to the same C kernel as DCT_DCT.
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;  // Identity path with block size 64.
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;  // Any tx_type not handled above trips the assert.
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
@@ -389,6 +476,11 @@
   const int rd_transform = fwd_txfm_param->rd_transform;
   const int lossless = fwd_txfm_param->lossless;
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
                      fwd_txfm_opt);
@@ -434,6 +526,12 @@
   const int lossless = fwd_txfm_param->lossless;
   const int bd = fwd_txfm_param->bd;
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+                            bd);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       highbd_fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
                             fwd_txfm_opt, bd);
diff --git a/av1/encoder/ransac.c b/av1/encoder/ransac.c
index 714d567..2699c4f 100644
--- a/av1/encoder/ransac.c
+++ b/av1/encoder/ransac.c
@@ -81,10 +81,12 @@
                                              const int stride_points,
                                              const int stride_proj) {
   int i;
-  double x, y, Z;
+  double x, y, Z, Z_inv;
   for (i = 0; i < n; ++i) {
     x = *(points++), y = *(points++);
-    Z = 1. / (mat[7] * x + mat[6] * y + 1);
+    Z_inv = mat[7] * x + mat[6] * y + 1;
+    assert(fabs(Z_inv) > 0.00001);
+    Z = 1. / Z_inv;
     *(proj++) = (mat[1] * x + mat[0] * y + mat[3]) * Z;
     *(proj++) = (mat[2] * x + mat[4] * y + mat[4]) * Z;
     points += stride_points - 2;
@@ -155,7 +157,7 @@
   double T1[9], T2[9];
 
   *number_of_inliers = 0;
-  if (npoints < minpts * MINPTS_MULTIPLIER) {
+  if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
     printf("Cannot find motion with %d matches\n", npoints);
     return 1;
   }
@@ -245,11 +247,15 @@
       }
     }
 
-    if (num_inliers >= max_inliers) {
-      double mean_distance = sum_distance / ((double)num_inliers);
-      double variance = sum_distance_squared / ((double)num_inliers - 1.0) -
-                        mean_distance * mean_distance * ((double)num_inliers) /
-                            ((double)num_inliers - 1.0);
+    if (num_inliers >= max_inliers && num_inliers > 1) {
+      int temp;
+      double fracinliers, pNoOutliers, mean_distance, variance;
+
+      assert(num_inliers > 1);
+      mean_distance = sum_distance / ((double)num_inliers);
+      variance = sum_distance_squared / ((double)num_inliers - 1.0) -
+                 mean_distance * mean_distance * ((double)num_inliers) /
+                     ((double)num_inliers - 1.0);
       if ((num_inliers > max_inliers) ||
           (num_inliers == max_inliers && variance < best_variance)) {
         best_variance = variance;
@@ -262,16 +268,15 @@
         memcpy(best_inlier_mask, inlier_mask,
                npoints * sizeof(*best_inlier_mask));
 
-        if (num_inliers > 0) {
-          double fracinliers = (double)num_inliers / (double)npoints;
-          double pNoOutliers = 1 - pow(fracinliers, minpts);
-          int temp;
-          pNoOutliers = fmax(EPS, pNoOutliers);
-          pNoOutliers = fmin(1 - EPS, pNoOutliers);
-          temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers));
-          if (temp > 0 && temp < N) {
-            N = AOMMAX(temp, MIN_TRIALS);
-          }
+        assert(npoints > 0);
+        fracinliers = (double)num_inliers / (double)npoints;
+        pNoOutliers = 1 - pow(fracinliers, minpts);
+        pNoOutliers = fmax(EPS, pNoOutliers);
+        pNoOutliers = fmin(1 - EPS, pNoOutliers);
+        assert(fabs(1.0 - pNoOutliers) > 0.00001);
+        temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers));
+        if (temp > 0 && temp < N) {
+          N = AOMMAX(temp, MIN_TRIALS);
         }
       }
     }
@@ -356,6 +361,7 @@
     // normalize so that H33 = 1
     int i;
     const double m = 1.0 / best_params[8];
+    assert(fabs(best_params[8]) > 0.00001);
     for (i = 0; i < 8; ++i) best_params[i] *= m;
     best_params[8] = 1.0;
   }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index a3929ef..6a750d9 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2906,8 +2906,7 @@
 #if CONFIG_VAR_TX
 void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
                        int blk_row, int blk_col, int plane, int block,
-                       int plane_bsize, int coeff_ctx, int *rate, int64_t *dist,
-                       int64_t *bsse, int *skip) {
+                       int plane_bsize, int coeff_ctx, RD_STATS *rd_stats) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
@@ -2994,7 +2993,7 @@
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
-  *bsse += tmp * 16;
+  rd_stats->sse += tmp * 16;
 
   if (p->eobs[block] > 0) {
     INV_TXFM_PARAM inv_txfm_param;
@@ -3035,10 +3034,10 @@
       tmp = this_dist;
     }
   }
-  *dist += tmp * 16;
-  *rate += av1_cost_coeffs(cm, x, plane, block, coeff_ctx, tx_size,
-                           scan_order->scan, scan_order->neighbors, 0);
-  *skip &= (p->eobs[block] == 0);
+  rd_stats->dist += tmp * 16;
+  rd_stats->rate += av1_cost_coeffs(cm, x, plane, block, coeff_ctx, tx_size,
+                                    scan_order->scan, scan_order->neighbors, 0);
+  rd_stats->skip &= (p->eobs[block] == 0);
 }
 
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
@@ -3046,8 +3045,8 @@
                             int depth, BLOCK_SIZE plane_bsize,
                             ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
                             TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
-                            int *rate, int64_t *dist, int64_t *bsse, int *skip,
-                            int64_t ref_best_rd, int *is_cost_valid) {
+                            RD_STATS *rd_stats, int64_t ref_best_rd,
+                            int *is_cost_valid) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -3086,10 +3085,10 @@
 
   coeff_ctx = get_entropy_context(tx_size, pta, ptl);
 
-  *rate = 0;
-  *dist = 0;
-  *bsse = 0;
-  *skip = 1;
+  rd_stats->rate = 0;
+  rd_stats->dist = 0;
+  rd_stats->sse = 0;
+  rd_stats->skip = 1;
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -3099,25 +3098,26 @@
   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
     inter_tx_size[0][0] = tx_size;
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+                      plane_bsize, coeff_ctx, rd_stats);
 
-    if ((RDCOST(x->rdmult, x->rddiv, *rate, *dist) >=
-             RDCOST(x->rdmult, x->rddiv, zero_blk_rate, *bsse) ||
-         *skip == 1) &&
+    if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >=
+             RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) ||
+         rd_stats->skip == 1) &&
         !xd->lossless[mbmi->segment_id]) {
-      *rate = zero_blk_rate;
-      *dist = *bsse;
-      *skip = 1;
+      rd_stats->rate = zero_blk_rate;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip = 1;
       x->blk_skip[plane][blk_row * bw + blk_col] = 1;
       p->eobs[block] = 0;
     } else {
       x->blk_skip[plane][blk_row * bw + blk_col] = 0;
-      *skip = 0;
+      rd_stats->skip = 0;
     }
 
     if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
-      *rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
-    this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
+      rd_stats->rate +=
+          av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+    this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
     tmp_eob = p->eobs[block];
   }
 
@@ -3125,10 +3125,7 @@
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     const int bsl = tx_size_wide_unit[sub_txs];
     int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
-    int this_rate;
-    int64_t this_dist;
-    int64_t this_bsse;
-    int this_skip;
+    RD_STATS this_rd_stats;
     int this_cost_valid = 1;
     int64_t tmp_rd = 0;
 
@@ -3141,12 +3138,12 @@
       int offsetc = (i & 0x01) * bsl;
       select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
                       block + i * sub_step, sub_txs, depth + 1, plane_bsize, ta,
-                      tl, tx_above, tx_left, &this_rate, &this_dist, &this_bsse,
-                      &this_skip, ref_best_rd - tmp_rd, &this_cost_valid);
-      sum_rate += this_rate;
-      sum_dist += this_dist;
-      sum_bsse += this_bsse;
-      all_skip &= this_skip;
+                      tl, tx_above, tx_left, &this_rd_stats,
+                      ref_best_rd - tmp_rd, &this_cost_valid);
+      sum_rate += this_rd_stats.rate;
+      sum_dist += this_rd_stats.dist;
+      sum_bsse += this_rd_stats.sse;
+      all_skip &= this_rd_stats.skip;
       tmp_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (this_rd < tmp_rd) break;
     }
@@ -3165,29 +3162,29 @@
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (this_rd == INT64_MAX) *is_cost_valid = 0;
-    x->blk_skip[plane][blk_row * bw + blk_col] = *skip;
+    x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
   } else {
-    *rate = sum_rate;
-    *dist = sum_dist;
-    *bsse = sum_bsse;
-    *skip = all_skip;
+    rd_stats->rate = sum_rate;
+    rd_stats->dist = sum_dist;
+    rd_stats->sse = sum_bsse;
+    rd_stats->skip = all_skip;
     if (sum_rd == INT64_MAX) *is_cost_valid = 0;
   }
 }
 
-static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int64_t *distortion, int *skippable, int64_t *sse,
-                            BLOCK_SIZE bsize, int64_t ref_best_rd) {
+static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int is_cost_valid = 1;
   int64_t this_rd = 0;
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
-  *rate = 0;
-  *distortion = 0;
-  *sse = 0;
-  *skippable = 1;
+  rd_stats->rate = 0;
+  rd_stats->dist = 0;
+  rd_stats->sse = 0;
+  rd_stats->skip = 1;
 
   if (is_cost_valid) {
     const struct macroblockd_plane *const pd = &xd->plane[0];
@@ -3205,8 +3202,11 @@
     TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
     TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
 
-    int pnrate = 0, pnskip = 1;
-    int64_t pndist = 0, pnsse = 0;
+    RD_STATS pn_rd_stats;
+    pn_rd_stats.rate = 0;
+    pn_rd_stats.skip = 1;
+    pn_rd_stats.dist = 0;
+    pn_rd_stats.sse = 0;
 
     av1_get_entropy_contexts(bsize, TX_4X4, pd, ctxa, ctxl);
     memcpy(tx_above, xd->above_txfm_context,
@@ -3218,35 +3218,35 @@
       for (idx = 0; idx < mi_width; idx += bw) {
         select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size,
                         mi_height != mi_width, plane_bsize, ctxa, ctxl,
-                        tx_above, tx_left, &pnrate, &pndist, &pnsse, &pnskip,
-                        ref_best_rd - this_rd, &is_cost_valid);
-        *rate += pnrate;
-        *distortion += pndist;
-        *sse += pnsse;
-        *skippable &= pnskip;
-        this_rd += AOMMIN(RDCOST(x->rdmult, x->rddiv, pnrate, pndist),
-                          RDCOST(x->rdmult, x->rddiv, 0, pnsse));
+                        tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd,
+                        &is_cost_valid);
+        rd_stats->rate += pn_rd_stats.rate;
+        rd_stats->dist += pn_rd_stats.dist;
+        rd_stats->sse += pn_rd_stats.sse;
+        rd_stats->skip &= pn_rd_stats.skip;
+        this_rd += AOMMIN(
+            RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist),
+            RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse));
         block += step;
       }
     }
   }
 
-  this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
-                   RDCOST(x->rdmult, x->rddiv, 0, *sse));
+  this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+                   RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
   if (this_rd > ref_best_rd) is_cost_valid = 0;
 
   if (!is_cost_valid) {
     // reset cost value
-    *rate = INT_MAX;
-    *distortion = INT64_MAX;
-    *sse = INT64_MAX;
-    *skippable = 0;
+    rd_stats->rate = INT_MAX;
+    rd_stats->dist = INT64_MAX;
+    rd_stats->sse = INT64_MAX;
+    rd_stats->skip = 0;
   }
 }
 
 static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
-                                       int *rate, int64_t *dist, int *skippable,
-                                       int64_t *sse, BLOCK_SIZE bsize,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t ref_best_rd, TX_TYPE tx_type) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3258,55 +3258,52 @@
   int64_t rd;
 
   mbmi->tx_type = tx_type;
-  inter_block_yrd(cpi, x, rate, dist, skippable, sse, bsize, ref_best_rd);
+  inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   if (is_rect_tx_allowed(xd, mbmi)) {
-    int rate_rect_tx, skippable_rect_tx = 0;
-    int64_t dist_rect_tx, sse_rect_tx, rd_rect_tx;
+    RD_STATS rect_rd_stats;
+    int64_t rd_rect_tx;
     int tx_size_cat = inter_tx_size_cat_lookup[bsize];
     TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
     TX_SIZE var_tx_size = mbmi->tx_size;
 
-    txfm_rd_in_plane(x, cpi, &rate_rect_tx, &dist_rect_tx, &skippable_rect_tx,
-                     &sse_rect_tx, ref_best_rd, 0, bsize, tx_size,
-                     cpi->sf.use_fast_coef_costing);
+    txfm_rd_in_plane(x, cpi, &rect_rd_stats.rate, &rect_rd_stats.dist,
+                     &rect_rd_stats.skip, &rect_rd_stats.sse, ref_best_rd, 0,
+                     bsize, tx_size, cpi->sf.use_fast_coef_costing);
 
-    if (*rate != INT_MAX) {
-      *rate += av1_cost_bit(cm->fc->rect_tx_prob[tx_size_cat], 0);
-      if (*skippable) {
-        rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+    if (rd_stats->rate != INT_MAX) {
+      rd_stats->rate += av1_cost_bit(cm->fc->rect_tx_prob[tx_size_cat], 0);
+      if (rd_stats->skip) {
+        rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
       } else {
-        rd = RDCOST(x->rdmult, x->rddiv, *rate + s0, *dist);
+        rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist);
         if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-            !(*skippable))
-          rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+            !rd_stats->skip)
+          rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
       }
     } else {
       rd = INT64_MAX;
     }
 
-    if (rate_rect_tx != INT_MAX) {
-      rate_rect_tx += av1_cost_bit(cm->fc->rect_tx_prob[tx_size_cat], 1);
-      if (skippable_rect_tx) {
-        rd_rect_tx = RDCOST(x->rdmult, x->rddiv, s1, sse_rect_tx);
+    if (rect_rd_stats.rate != INT_MAX) {
+      rect_rd_stats.rate += av1_cost_bit(cm->fc->rect_tx_prob[tx_size_cat], 1);
+      if (rect_rd_stats.skip) {
+        rd_rect_tx = RDCOST(x->rdmult, x->rddiv, s1, rect_rd_stats.sse);
       } else {
-        rd_rect_tx =
-            RDCOST(x->rdmult, x->rddiv, rate_rect_tx + s0, dist_rect_tx);
+        rd_rect_tx = RDCOST(x->rdmult, x->rddiv, rect_rd_stats.rate + s0,
+                            rect_rd_stats.dist);
         if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-            !(skippable_rect_tx))
-          rd_rect_tx =
-              AOMMIN(rd_rect_tx, RDCOST(x->rdmult, x->rddiv, s1, sse_rect_tx));
+            !(rect_rd_stats.skip))
+          rd_rect_tx = AOMMIN(
+              rd_rect_tx, RDCOST(x->rdmult, x->rddiv, s1, rect_rd_stats.sse));
       }
     } else {
       rd_rect_tx = INT64_MAX;
     }
 
     if (rd_rect_tx < rd) {
-      *rate = rate_rect_tx;
-      *dist = dist_rect_tx;
-      *sse = sse_rect_tx;
-      *skippable = skippable_rect_tx;
-      if (!xd->lossless[mbmi->segment_id]) x->blk_skip[0][0] = *skippable;
+      *rd_stats = rect_rd_stats;
+      if (!xd->lossless[mbmi->segment_id]) x->blk_skip[0][0] = rd_stats->skip;
       mbmi->tx_size = tx_size;
       mbmi->inter_tx_size[0][0] = mbmi->tx_size;
     } else {
@@ -3315,7 +3312,7 @@
   }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 
-  if (*rate == INT_MAX) return INT64_MAX;
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
 
 #if CONFIG_EXT_TX
   if (get_ext_tx_types(mbmi->tx_size, bsize, is_inter) > 1 &&
@@ -3323,41 +3320,41 @@
     int ext_tx_set = get_ext_tx_set(mbmi->tx_size, bsize, is_inter);
     if (is_inter) {
       if (ext_tx_set > 0)
-        *rate +=
+        rd_stats->rate +=
             cpi->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->tx_size]]
                                     [mbmi->tx_type];
     } else {
       if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-        *rate += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->mode]
-                                         [mbmi->tx_type];
+        rd_stats->rate += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                                  [mbmi->mode][mbmi->tx_type];
     }
   }
 #else   // CONFIG_EXT_TX
   if (mbmi->tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
     if (is_inter)
-      *rate += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+      rd_stats->rate += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
     else
-      *rate +=
+      rd_stats->rate +=
           cpi->intra_tx_type_costs[mbmi->tx_size]
                                   [intra_mode_to_tx_type_context[mbmi->mode]]
                                   [mbmi->tx_type];
   }
 #endif  // CONFIG_EXT_TX
 
-  if (*skippable)
-    rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+  if (rd_stats->skip)
+    rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
   else
-    rd = RDCOST(x->rdmult, x->rddiv, *rate + s0, *dist);
+    rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist);
 
-  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(*skippable))
-    rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+      !(rd_stats->skip))
+    rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
 
   return rd;
 }
 
-static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int *rate,
-                               int64_t *distortion, int *skippable,
-                               int64_t *sse, BLOCK_SIZE bsize,
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                               RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                int64_t ref_best_rd) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3383,16 +3380,17 @@
     prune = prune_tx_types(cpi, bsize, x, xd, 0);
 #endif
 
-  *distortion = INT64_MAX;
-  *rate = INT_MAX;
-  *skippable = 0;
-  *sse = INT64_MAX;
+  rd_stats->dist = INT64_MAX;
+  rd_stats->rate = INT_MAX;
+  rd_stats->skip = 0;
+  rd_stats->sse = INT64_MAX;
 
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-    int this_rate = 0;
-    int this_skip = 1;
-    int64_t this_dist = 0;
-    int64_t this_sse = 0;
+    RD_STATS this_rd_stats;
+    this_rd_stats.rate = 0;
+    this_rd_stats.skip = 1;
+    this_rd_stats.dist = 0;
+    this_rd_stats.sse = 0;
 #if CONFIG_EXT_TX
     if (is_inter) {
       if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
@@ -3415,15 +3413,12 @@
         tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
       continue;
 
-    rd = select_tx_size_fix_type(cpi, x, &this_rate, &this_dist, &this_skip,
-                                 &this_sse, bsize, ref_best_rd, tx_type);
+    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+                                 tx_type);
 
     if (rd < best_rd) {
       best_rd = rd;
-      *distortion = this_dist;
-      *rate = this_rate;
-      *skippable = this_skip;
-      *sse = this_sse;
+      *rd_stats = this_rd_stats;
       best_tx_type = mbmi->tx_type;
       best_tx = mbmi->tx_size;
       memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
@@ -3444,8 +3439,7 @@
 static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                         int blk_col, int plane, int block, TX_SIZE tx_size,
                         BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
-                        ENTROPY_CONTEXT *left_ctx, int *rate, int64_t *dist,
-                        int64_t *bsse, int *skip) {
+                        ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -3473,7 +3467,7 @@
     ENTROPY_CONTEXT *tl = left_ctx + blk_row;
     coeff_ctx = get_entropy_context(tx_size, ta, tl);
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+                      plane_bsize, coeff_ctx, rd_stats);
 
     for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
       ta[i] = !(p->eobs[block] == 0);
@@ -3492,16 +3486,16 @@
       int offsetc = (i & 0x01) * bsl;
       tx_block_rd(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
                   block + i * step, sub_txs, plane_bsize, above_ctx, left_ctx,
-                  rate, dist, bsse, skip);
+                  rd_stats);
     }
   }
 }
 
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
-static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int64_t *distortion, int *skippable, int64_t *sse,
-                            BLOCK_SIZE bsize, int64_t ref_best_rd) {
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int plane;
@@ -3510,14 +3504,15 @@
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
-  *rate = 0;
-  *distortion = 0;
-  *sse = 0;
-  *skippable = 1;
+  rd_stats->rate = 0;
+  rd_stats->dist = 0;
+  rd_stats->sse = 0;
+  rd_stats->skip = 1;
 
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   if (is_rect_tx(mbmi->tx_size)) {
-    return super_block_uvrd(cpi, x, rate, distortion, skippable, sse, bsize,
+    return super_block_uvrd(cpi, x, &rd_stats->rate, &rd_stats->dist,
+                            &rd_stats->skip, &rd_stats->sse, bsize,
                             ref_best_rd);
   }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -3538,33 +3533,37 @@
     int idx, idy;
     int block = 0;
     const int step = bh * bw;
-    int pnrate = 0, pnskip = 1;
-    int64_t pndist = 0, pnsse = 0;
     ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
     ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
+    RD_STATS pn_rd_stats;
+    pn_rd_stats.rate = 0;
+    pn_rd_stats.skip = 1;
+    pn_rd_stats.dist = 0;
+    pn_rd_stats.sse = 0;
 
     av1_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
 
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bw) {
         tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
-                    ta, tl, &pnrate, &pndist, &pnsse, &pnskip);
+                    ta, tl, &pn_rd_stats);
         block += step;
       }
     }
 
-    if (pnrate == INT_MAX) {
+    if (pn_rd_stats.rate == INT_MAX) {
       is_cost_valid = 0;
       break;
     }
 
-    *rate += pnrate;
-    *distortion += pndist;
-    *sse += pnsse;
-    *skippable &= pnskip;
+    rd_stats->rate += pn_rd_stats.rate;
+    rd_stats->dist += pn_rd_stats.dist;
+    rd_stats->sse += pn_rd_stats.sse;
+    rd_stats->skip &= pn_rd_stats.skip;
 
-    this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
-                     RDCOST(x->rdmult, x->rddiv, 0, *sse));
+    this_rd =
+        AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+               RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
 
     if (this_rd > ref_best_rd) {
       is_cost_valid = 0;
@@ -3574,10 +3573,10 @@
 
   if (!is_cost_valid) {
     // reset cost value
-    *rate = INT_MAX;
-    *distortion = INT64_MAX;
-    *sse = INT64_MAX;
-    *skippable = 0;
+    rd_stats->rate = INT_MAX;
+    rd_stats->dist = INT64_MAX;
+    rd_stats->sse = INT64_MAX;
+    rd_stats->skip = 0;
   }
 
   return is_cost_valid;
@@ -7493,13 +7492,21 @@
       int skippable_y, skippable_uv;
       int64_t sseuv = INT64_MAX;
       int64_t rdcosty = INT64_MAX;
+      int is_cost_valid_uv = 0;
+#if CONFIG_VAR_TX
+      RD_STATS rd_stats_uv;
+#endif
 
       // Y cost and distortion
       av1_subtract_plane(x, bsize, 0);
 #if CONFIG_VAR_TX
       if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
-        select_tx_type_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
-                           bsize, ref_best_rd);
+        RD_STATS rd_stats_y;
+        select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, ref_best_rd);
+        *rate_y = rd_stats_y.rate;
+        distortion_y = rd_stats_y.dist;
+        skippable_y = rd_stats_y.skip;
+        *psse = rd_stats_y.sse;
       } else {
         int idx, idy;
         super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
@@ -7537,20 +7544,25 @@
       rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
 #if CONFIG_VAR_TX
-      if (!inter_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
-                            &sseuv, bsize, ref_best_rd - rdcosty))
+      is_cost_valid_uv =
+          inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, ref_best_rd - rdcosty);
+      *rate_uv = rd_stats_uv.rate;
+      distortion_uv = rd_stats_uv.dist;
+      skippable_uv = rd_stats_uv.skip;
+      sseuv = rd_stats_uv.sse;
 #else
-    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
-                          &sseuv, bsize, ref_best_rd - rdcosty))
+      is_cost_valid_uv =
+          super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                           &sseuv, bsize, ref_best_rd - rdcosty);
 #endif  // CONFIG_VAR_TX
-      {
+      if (!is_cost_valid_uv) {
         *rate2 = INT_MAX;
         *distortion = INT64_MAX;
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
         continue;
 #else
-      restore_dst_buf(xd, orig_dst, orig_dst_stride);
-      return INT64_MAX;
+        restore_dst_buf(xd, orig_dst, orig_dst_stride);
+        return INT64_MAX;
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
       }
 
@@ -9201,6 +9213,9 @@
     }
 
     if (is_inter_mode(mbmi->mode)) {
+#if CONFIG_VAR_TX
+      RD_STATS rd_stats_uv;
+#endif
       av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 #if CONFIG_MOTION_VAR
       if (mbmi->motion_mode == OBMC_CAUSAL)
@@ -9210,8 +9225,12 @@
       av1_subtract_plane(x, bsize, 0);
 #if CONFIG_VAR_TX
       if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
-        select_tx_type_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y, bsize,
-                           INT64_MAX);
+        RD_STATS rd_stats_y;
+        select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+        rate_y = rd_stats_y.rate;
+        dist_y = rd_stats_y.dist;
+        sse_y = rd_stats_y.sse;
+        skip_y = rd_stats_y.skip;
       } else {
         int idx, idy;
         super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y, bsize,
@@ -9223,8 +9242,11 @@
                sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
       }
 
-      inter_block_uvrd(cpi, x, &rate_uv, &dist_uv, &skip_uv, &sse_uv, bsize,
-                       INT64_MAX);
+      inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+      rate_uv = rd_stats_uv.rate;
+      dist_uv = rd_stats_uv.dist;
+      skip_uv = rd_stats_uv.skip;
+      sse_uv = rd_stats_uv.sse;
 #else
       super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y, bsize,
                       INT64_MAX);
@@ -9424,7 +9446,10 @@
 #endif  // CONFIG_REF_MV
 #if CONFIG_GLOBAL_MOTION
     zeromv[0].as_int = cm->global_motion[refs[0]].motion_params.wmmat[0].as_int;
-    zeromv[1].as_int = cm->global_motion[refs[1]].motion_params.wmmat[0].as_int;
+    if (comp_pred_mode) {
+      zeromv[1].as_int =
+          cm->global_motion[refs[1]].motion_params.wmmat[0].as_int;
+    }
 #else
     zeromv[0].as_int = 0;
     zeromv[1].as_int = 0;
@@ -10354,16 +10379,24 @@
       if (tmp_best_rdu > 0) {
         // If even the 'Y' rd value of split is higher than best so far
         // then dont bother looking at UV
+        int is_cost_valid_uv;
+#if CONFIG_VAR_TX
+        RD_STATS rd_stats_uv;
+#endif
         av1_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8);
 #if CONFIG_VAR_TX
-        if (!inter_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
-          continue;
+        is_cost_valid_uv =
+            inter_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu);
+        rate_uv = rd_stats_uv.rate;
+        distortion_uv = rd_stats_uv.dist;
+        uv_skippable = rd_stats_uv.skip;
+        uv_sse = rd_stats_uv.sse;
 #else
-        if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
-          continue;
+        is_cost_valid_uv =
+            super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                             &uv_sse, BLOCK_8X8, tmp_best_rdu);
 #endif
+        if (!is_cost_valid_uv) continue;
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 5d9fc12..8c65770 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -26,6 +26,16 @@
 struct macroblock;
 struct RD_COST;
 
+#if CONFIG_VAR_TX
+// TODO(angiebird): Merge RD_COST and RD_STATS
+typedef struct RD_STATS {  // RD results used by the CONFIG_VAR_TX paths.
+  int rate;      // RDCOST() rate term; INT_MAX flags an invalid cost.
+  int64_t dist;  // RDCOST() distortion term; INT64_MAX when invalid.
+  int64_t sse;   // Distortion used when costing as skipped.
+  int skip;      // AND of per-block skip flags; 0 when cost is invalid.
+} RD_STATS;
+#endif  // CONFIG_VAR_TX
+
 int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
                     int block, int coeff_ctx, TX_SIZE tx_size,
                     const int16_t *scan, const int16_t *nb,
@@ -77,8 +87,7 @@
 #if CONFIG_VAR_TX
 void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
                        int blk_row, int blk_col, int plane, int block,
-                       int plane_bsize, int coeff_ctx, int *rate, int64_t *dist,
-                       int64_t *bsse, int *skip);
+                       int plane_bsize, int coeff_ctx, RD_STATS *rd_stats);
 #endif
 
 void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,