Merge "Rework sub8x8 chroma component inter predictor" into nextgenv2

diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index 97f2c02..deccc81 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc

@@ -132,7 +132,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int i, j;
   const int stride = 16;
-  const int num_tests = 200000;
+  const int num_tests = 1000;
 
   for (i = 0; i < num_tests; ++i) {
     for (j = 0; j < num_coeffs_; ++j) {
@@ -208,6 +208,7 @@
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 10),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 12),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 10),
@@ -218,7 +219,6 @@
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 12),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 10),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 12),
-#if CONFIG_EXT_TX
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(

diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
index 1309827..c5a4382 100644
--- a/test/vp10_fht4x4_test.cc
+++ b/test/vp10_fht4x4_test.cc

@@ -132,7 +132,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int i, j;
   const int stride = 4;
-  const int num_tests = 200000;
+  const int num_tests = 1000;
   const int num_coeffs = 16;
 
   for (i = 0; i < num_tests; ++i) {

diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
index 2c33939..da278c4 100644
--- a/test/vp10_fht8x8_test.cc
+++ b/test/vp10_fht8x8_test.cc

@@ -131,7 +131,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int i, j;
   const int stride = 8;
-  const int num_tests = 200000;
+  const int num_tests = 1000;
   const int num_coeffs = 64;
 
   for (i = 0; i < num_tests; ++i) {

diff --git a/test/vp10_iht4x4_test.cc b/test/vp10_iht4x4_test.cc
index 1cad402..3960b5a 100644
--- a/test/vp10_iht4x4_test.cc
+++ b/test/vp10_iht4x4_test.cc

@@ -15,6 +15,7 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
 namespace {
@@ -34,24 +35,27 @@
 //   <target optimization function, tx_type, bit_depth>
 typedef tuple<IHbdHtFunc, int, int> IHbdHt4x4Param;
 
-class VP10HighbdInvTrans4x4HT :
-      public ::testing::TestWithParam<IHbdHt4x4Param> {
+class VP10HighbdInvTrans4x4HT
+    : public ::testing::TestWithParam<IHbdHt4x4Param> {
  public:
   virtual ~VP10HighbdInvTrans4x4HT() {}
 
   virtual void SetUp() {
     inv_txfm_ = GET_PARAM(0);
-    inv_txfm_ref_ = iht4x4_ref;
     tx_type_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     num_coeffs_ = 4 * 4;
 
+    // Note:
+    // The inverse transform input buffer is 32-byte aligned.
+    // Refer to the function alloc_mode_context() in
+    // vp10/encoder/context_tree.c.
     coeffs_ = reinterpret_cast<int32_t *>(
-        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+        vpx_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
     output_ = reinterpret_cast<uint16_t *>(
-        vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+        vpx_memalign(32, sizeof(output_[0]) * num_coeffs_));
     output_ref_ = reinterpret_cast<uint16_t *>(
-        vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+        vpx_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
   }
 
   virtual void TearDown() {
@@ -65,49 +69,39 @@
   void RunBitexactCheck();
 
  private:
+  // Clamps number to the symmetric range [-(2^bit - 1), 2^bit - 1].
+  static int32_t ClampCoeffs(int number, int bit) {
+    const int limit = (1 << bit) - 1;
+    return clamp(number, -limit, limit);
+  }
+
   IHbdHtFunc inv_txfm_;
-  IHbdHtFunc inv_txfm_ref_;
   int tx_type_;
   int bit_depth_;
   int num_coeffs_;
   int32_t *coeffs_;
   uint16_t *output_;
   uint16_t *output_ref_;
-
-  int32_t clamp(int32_t number, int bit) {
-    int32_t ret = number;
-    const int32_t max = (int32_t)(1 << bit) - 1;
-    const int32_t min = -max;
-
-    if (number > max) {
-      ret = max;
-    } else if (number < min) {
-      ret = min;
-    }
-    return ret;
-  }
 };
 
 void VP10HighbdInvTrans4x4HT::RunBitexactCheck() {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int stride = 4;
   const int num_tests = 2000000;
-  int i;
-  int j;
   const uint16_t mask = (1 << bit_depth_) - 1;
 
-  for (i = 0; i < num_tests; ++i) {
-    for (j = 0; j < num_coeffs_; ++j) {
-      coeffs_[j] = clamp((rnd.Rand16() - rnd.Rand16()) << 2, 18);
+  for (int i = 0; i < num_tests; ++i) {
+    for (int j = 0; j < num_coeffs_; ++j) {
+      coeffs_[j] = ClampCoeffs((rnd.Rand16() - rnd.Rand16()) << 2, 18);
       output_ref_[j] = rnd.Rand16() & mask;
       output_[j] = output_ref_[j];
     }
 
-    inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+    iht4x4_ref(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
     ASM_REGISTER_STATE_CHECK(inv_txfm_(coeffs_, output_, stride, tx_type_,
                                        bit_depth_));
 
-    for (j = 0; j < num_coeffs_; ++j) {
+    for (int j = 0; j < num_coeffs_; ++j) {
       EXPECT_EQ(output_ref_[j], output_[j])
           << "Not bit-exact result at index: " << j
           << "At test block: " << i;

diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
index eef95f0..fef4629 100644
--- a/test/vp10_inv_txfm2d_test.cc
+++ b/test/vp10_inv_txfm2d_test.cc

@@ -104,8 +104,6 @@
   TX_SIZE tx_size_;
   int txfm1d_size_;
   int txfm2d_size_;
-  Fwd_Txfm2d_Func fwd_txfm_;
-  Inv_Txfm2d_Func inv_txfm_;
   int16_t* input_;
   uint16_t* ref_input_;
   int32_t* output_;

diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index a5d50bb..717c914 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c

@@ -1297,7 +1297,8 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_4x4_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -1336,7 +1337,8 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_8x8_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -1375,7 +1377,8 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_16x16_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:

diff --git a/vp10/common/x86/highbd_inv_txfm_sse4.c b/vp10/common/x86/highbd_inv_txfm_sse4.c
index 0c623df..80d4c4f 100644
--- a/vp10/common/x86/highbd_inv_txfm_sse4.c
+++ b/vp10/common/x86/highbd_inv_txfm_sse4.c

@@ -9,18 +9,17 @@
  */
 
 #include <assert.h>
-#include <smmintrin.h> /* SSE4.1 */
+#include <smmintrin.h>  /* SSE4.1 */
 
 #include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "vp10/common/vp10_inv_txfm2d_cfg.h"
 
-
 static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
-  in[0] = _mm_loadu_si128((const __m128i *)(coeff + 0));
-  in[1] = _mm_loadu_si128((const __m128i *)(coeff + 4));
-  in[2] = _mm_loadu_si128((const __m128i *)(coeff + 8));
-  in[3] = _mm_loadu_si128((const __m128i *)(coeff + 12));
+  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));  // needs 16B alignment
+  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
 }
 
 static void idct4x4_sse4_1(__m128i *in, int bit) {
@@ -176,7 +175,7 @@
 }
 
 static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
-                             int flipud, int fliplr, int shift, int bd) {
+                             int shift, int bd) {
   const __m128i zero = _mm_setzero_si128();
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;
@@ -213,9 +212,6 @@
   _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
   _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
   _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
-
-  (void) flipud;
-  (void) fliplr;
 }
 
 void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
@@ -229,28 +225,28 @@
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
       idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
       break;
     case ADST_DCT:
       cfg = &inv_txfm_2d_cfg_adst_dct_4;
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
       iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
       break;
     case DCT_ADST:
       cfg = &inv_txfm_2d_cfg_dct_adst_4;
       load_buffer_4x4(coeff, in);
       iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
       idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
       break;
     case ADST_ADST:
       cfg = &inv_txfm_2d_cfg_adst_adst_4;
       load_buffer_4x4(coeff, in);
       iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
       iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
-      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
       break;
     default:
       assert(0);

diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 67ebe6d..673a9e6 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h

@@ -659,7 +659,7 @@
 }
 
 static INLINE int get_ref_frame_buf_idx(const VP10_COMP *const cpi,
-                                        int ref_frame) {
+                                        MV_REFERENCE_FRAME ref_frame) {
   const VP10_COMMON *const cm = &cpi->common;
   const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
@@ -673,6 +673,14 @@
       buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
 }
 
+// Returns the up-sampled reference frame buffer selected by ref_frame.
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
+    VP10_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
+  const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+  const int buf_idx = cpi->upsampled_ref_idx[map_idx];
+  return &cpi->upsampled_ref_bufs[buf_idx].buf;
+}
+
 static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
   // TODO(JBB): double check we can't exceed this token count if we have a
   // 32x32 transform crossing a boundary at a multiple of 16.

diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 5e88d15..c27c887 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c

@@ -7619,6 +7619,19 @@
 #else
     int tmp_rate2 = rate2_nocoeff;
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+    INTERP_FILTER obmc_interp_filter[2][2] = {
+        {mbmi->interp_filter[0], mbmi->interp_filter[1]},   // obmc == 0
+        {mbmi->interp_filter[0], mbmi->interp_filter[1]}    // obmc == 1
+    };
+#else
+    INTERP_FILTER obmc_interp_filter[2] = {
+        mbmi->interp_filter,  // obmc == 0
+        mbmi->interp_filter   // obmc == 1
+    };
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
 
     if (mbmi->obmc) {
 #if CONFIG_EXT_INTER
@@ -7647,6 +7660,21 @@
 #else
         tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+        if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+          obmc_interp_filter[1][0] = mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+        if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+          obmc_interp_filter[1][1] = mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#else
+        if (!vp10_is_interp_needed(xd))
+          obmc_interp_filter[1] = mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif  // CONFIG_DUAL_FILTER
+        // This is not quite correct with CONFIG_DUAL_FILTER when a filter
+        // is needed in only one direction
+        if (!vp10_is_interp_needed(xd))
+          tmp_rate2 -= rs;
+#endif  // CONFIG_EXT_INTERP
         vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 #if CONFIG_EXT_INTER
       } else {
@@ -7787,6 +7815,14 @@
 #if CONFIG_OBMC
     tmp_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     if (mbmi->obmc == 0 || (tmp_rd < best_rd)) {
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+      mbmi->interp_filter[0] = obmc_interp_filter[mbmi->obmc][0];
+      mbmi->interp_filter[1] = obmc_interp_filter[mbmi->obmc][1];
+#else
+      mbmi->interp_filter = obmc_interp_filter[mbmi->obmc];
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rate2 = *rate2;

diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index 2ca39a5..be6227b 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h

@@ -90,24 +90,6 @@
                                    int use_fast_coef_casting);
 #endif  // CONFIG_SUPERTX
 
-static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
-                                                          const int ref) {
-  // Use up-sampled reference frames.
-  int ref_idx = 0;
-  if (ref == LAST_FRAME)
-#if CONFIG_EXT_REFS
-    ref_idx = cpi->lst_fb_idxes[ref - LAST_FRAME];
-#else
-    ref_idx = cpi->lst_fb_idx;
-#endif  // CONFIG_EXT_REFS
-  else if (ref == GOLDEN_FRAME)
-    ref_idx = cpi->gld_fb_idx;
-  else if (ref == ALTREF_FRAME)
-    ref_idx = cpi->alt_fb_idx;
-
-  return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
-}
-
 #if CONFIG_OBMC
 void calc_target_weighted_pred(VP10_COMMON *cm,
                                MACROBLOCK *x,

diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
index 2ad59cf..8b27f55 100644
--- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c

@@ -997,6 +997,7 @@
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       cfg = &fwd_txfm_2d_cfg_adst_dct_8;
       load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
@@ -1047,6 +1048,7 @@
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }
@@ -1893,6 +1895,7 @@
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
       cfg = &fwd_txfm_2d_cfg_adst_dct_16;
       load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
@@ -1943,6 +1946,7 @@
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }

diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index a5802e1..4c0d5db 100644
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c

@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/fwd_txfm.h"
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {

diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 402fd9a..533f762 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c

@@ -11,6 +11,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/inv_txfm.h"
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 46ef646..645a1ab 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c

@@ -11,6 +11,7 @@
 #include <stdlib.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 

diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e4e741a..6426ccc 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c

@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 

diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 4df39df..951af3a 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h

@@ -10,6 +10,7 @@
 
 #include <immintrin.h>  // AVX2
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
 
 #define pair256_set_epi16(a, b) \

diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
index e4deeec..3e4f49b 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c

@@ -11,6 +11,7 @@
 #include <emmintrin.h>  // SSE2
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"