Merge changes Ia2dd6bb1,Id1220b03 into nextgenv2

* changes:
  transform tests: Avoid #if inside INSTANTIATE_TEST_CASE_P
  variance_test: Avoid #if inside INSTANTIATE_TEST_CASE_P
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index b6eeee5..f7eb141 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -400,15 +400,44 @@
   1, 16, 12, 2
 };
 static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
-  1, 12, 5
+  1, 7, 5
 };
 
 #if EXT_TX_SIZES == 4
 static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
                                  int is_inter) {
   if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
+#if USE_REDUCED_TXSET_FOR_16X16
+  if (tx_size == TX_32X32)
+    return is_inter ? 3 - USE_MSKTX_FOR_32X32 : 0;
+  return (tx_size == TX_16X16 ? 2 : 1);
+#else
   if (tx_size == TX_32X32)
     return is_inter ? 3 - 2 * USE_MSKTX_FOR_32X32 : 0;
+  return (tx_size == TX_16X16 && !is_inter ? 2 : 1);
+#endif  // USE_REDUCED_TXSET_FOR_16X16
+}
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, 0, 0, },
+  { 0, 0, 1, 0, },
+};
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, (!USE_REDUCED_TXSET_FOR_16X16), USE_MSKTX_FOR_32X32, },
+  { 0, 0, USE_REDUCED_TXSET_FOR_16X16, 0, },
+  { 0, 0, 0, (!USE_MSKTX_FOR_32X32), },
+};
+
+#else  // EXT_TX_SIZES == 4
+
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
+                                 int is_inter) {
+  (void) is_inter;
+  if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
+  if (tx_size == TX_32X32) return 0;
 #if USE_REDUCED_TXSET_FOR_16X16
   return (tx_size == TX_16X16 ? 2 : 1);
 #else
@@ -424,31 +453,8 @@
 
 static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
   { 0, 0, 0, 0, },  // unused
-  { 1, 1, 1, USE_MSKTX_FOR_32X32, },
-  { 0, 0, 0, 0, },
-  { 0, 0, 0, (!USE_MSKTX_FOR_32X32), },
-};
-
-#else  // EXT_TX_SIZES == 4
-
-static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
-                                 int is_inter) {
-  (void) is_inter;
-  if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
-  if (tx_size == TX_32X32) return 0;
-  return tx_size == TX_16X16 ? 2 : 1;
-}
-
-static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
-  { 0, 0, 0, 0, },  // unused
-  { 1, 1, 0, 0, },
-  { 0, 0, 1, 0, },
-};
-
-static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
-  { 0, 0, 0, 0, },  // unused
-  { 1, 1, 0, 0, },
-  { 0, 0, 1, 0, },
+  { 1, 1, (!USE_REDUCED_TXSET_FOR_16X16), 0, },
+  { 0, 0, USE_REDUCED_TXSET_FOR_16X16, 0, },
   { 0, 0, 0, 1, },
 };
 #endif  // EXT_TX_SIZES == 4
@@ -456,7 +462,7 @@
 // Transform types used in each intra set
 static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
   {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
+  {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0},
   {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
 };
 
@@ -479,13 +485,10 @@
 #define ALLOW_FILTER_INTRA_MODES 1
 #define ANGLE_STEP 3
 #define MAX_ANGLE_DELTAS 3
-#define ANGLE_FAST_SEARCH 1
-#define ANGLE_SKIP_THRESH 0.10
-#define FILTER_FAST_SEARCH 1
 
 extern const int16_t dr_intra_derivative[270][2];
 
-static uint8_t mode_to_angle_map[INTRA_MODES] = {
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {
     0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
 };
 
@@ -715,14 +718,7 @@
 }
 
 static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
-#if CONFIG_EXT_INTER
-  return (is_inter_block(mbmi) &&
-          !(has_second_ref(mbmi) && get_wedge_bits(mbmi->sb_type) &&
-            mbmi->use_wedge_interinter) &&
-          !(is_interintra_pred(mbmi)));
-#else
   return (is_inter_block(mbmi));
-#endif  // CONFIG_EXT_INTER
 }
 #endif  // CONFIG_OBMC
 
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index f1c8e30..e622ebc 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -976,13 +976,8 @@
     -DCT_DCT, 4,
     6, 8,
     -V_DCT, -H_DCT,
-    10, 16,
-    12, 14,
+    -ADST_ADST, 10,
     -ADST_DCT, -DCT_ADST,
-    -FLIPADST_DCT, -DCT_FLIPADST,
-    18, 20,
-    -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
     -DCT_DCT, 4,
@@ -1041,50 +1036,50 @@
     },
   }, {
     {
-      {   8, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {  10,  28,  32, 128, 176, 192, 208, 128, 128, 128, 128, },
-      {  10,  28,  32, 128, 176, 192,  48, 128, 128, 128, 128, },
-      {   9, 160,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   8,  28,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
-      {   7,  28,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   7,  20,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {  10,  23,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   8,  29,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  20,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224,  32, 128, 64, 128, },
+      {  10,  32,  32, 128, 16, 192, },
+      {  10,  32,  32, 128, 16,  64, },
+      {   9, 200,  32, 128, 64, 128, },
+      {   8,   8,  32, 128, 224, 128, },
+      {  10,  32,  32, 128, 16, 192, },
+      {  10,  32,  32, 128, 16,  64, },
+      {  10,  23,  32, 128, 80, 176, },
+      {  10,  23,  32, 128, 80, 176, },
+      {  10,  32,  32, 128, 16,  64, },
     }, {
-      {   2, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  28,  32, 128, 176, 192, 208, 128, 128, 128, 128, },
-      {   4,  28,  32, 128, 176, 192,  48, 128, 128, 128, 128, },
-      {   8, 160,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
-      {   3,  28,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  26,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   9,  24,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   5,  24,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   2,  25,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224, 32, 128,  64, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {   9, 200, 32, 128,  64, 128, },
+      {   8,   8, 32, 128, 224, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  32, 32, 128,  16,  64, },
     }, {
-      {   2, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  28,  32, 128, 176, 192, 208, 128, 128, 128, 128, },
-      {   1,  28,  32, 128, 176, 192,  48, 128, 128, 128, 128, },
-      {   4, 160,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
-      {   2,  28,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  29,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   4,  27,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   2,  34,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   1,  25,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224, 32, 128,  64, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {   9, 200, 32, 128,  64, 128, },
+      {   8,   8, 32, 128, 224, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  32, 32, 128,  16,  64, },
 #if EXT_TX_SIZES == 4
     }, {
-      {   2, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  12,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   1,  17,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   4,  41,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  17,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
-      {   2,  14,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  19,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   4,  27,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
-      {   2,  34,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
-      {   1,  15,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224, 32, 128,  64, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {   9, 200, 32, 128,  64, 128, },
+      {   8,   8, 32, 128, 224, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  32, 32, 128,  16,  64, },
 #endif
     },
   }, {
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 2d2563e..e5e2442 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -1402,6 +1402,19 @@
   }  // each mi in the left column
 }
 
+#if CONFIG_EXT_INTER
+void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+  if (is_interintra_pred(mbmi)) {
+    mbmi->ref_frame[1] = NONE;
+  } else if (has_second_ref(mbmi) && get_wedge_bits(mbmi->sb_type) &&
+             mbmi->use_wedge_interinter) {
+    mbmi->use_wedge_interinter = 0;
+    mbmi->ref_frame[1] = NONE;
+  }
+  return;
+}
+#endif  // CONFIG_EXT_INTER
+
 void vp10_build_prediction_by_above_preds(VP10_COMMON *cm,
                                           MACROBLOCKD *xd,
                                           int mi_row, int mi_col,
@@ -1420,6 +1433,9 @@
     MODE_INFO *above_mi = xd->mi[mi_col_offset +
                                  mi_row_offset * xd->mi_stride];
     MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+#if CONFIG_EXT_INTER
+    MB_MODE_INFO backup_mbmi;
+#endif  // CONFIG_EXT_INTER
 
     mi_step = VPXMIN(xd->n8_w,
                      num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
@@ -1427,6 +1443,11 @@
     if (!is_neighbor_overlappable(above_mbmi))
       continue;
 
+#if CONFIG_EXT_INTER
+    backup_mbmi = *above_mbmi;
+    modify_neighbor_predictor_for_obmc(above_mbmi);
+#endif  // CONFIG_EXT_INTER
+
     for (j = 0; j < MAX_MB_PLANE; ++j) {
       struct macroblockd_plane *const pd = &xd->plane[j];
       setup_pred_plane(&pd->dst,
@@ -1488,6 +1509,9 @@
                                mi_x, mi_y);
       }
     }
+#if CONFIG_EXT_INTER
+    *above_mbmi = backup_mbmi;
+#endif  // CONFIG_EXT_INTER
   }
   xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
 }
@@ -1513,6 +1537,9 @@
                                 mi_row_offset * xd->mi_stride];
     MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
     const int is_compound = has_second_ref(left_mbmi);
+#if CONFIG_EXT_INTER
+    MB_MODE_INFO backup_mbmi;
+#endif  // CONFIG_EXT_INTER
 
     mi_step = VPXMIN(xd->n8_h,
                      num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
@@ -1520,6 +1547,11 @@
     if (!is_neighbor_overlappable(left_mbmi))
       continue;
 
+#if CONFIG_EXT_INTER
+    backup_mbmi = *left_mbmi;
+    modify_neighbor_predictor_for_obmc(left_mbmi);
+#endif  // CONFIG_EXT_INTER
+
     for (j = 0; j < MAX_MB_PLANE; ++j) {
       struct macroblockd_plane *const pd = &xd->plane[j];
       setup_pred_plane(&pd->dst,
@@ -1581,6 +1613,9 @@
                                mi_x, mi_y);
       }
     }
+#if CONFIG_EXT_INTER
+    *left_mbmi = backup_mbmi;
+#endif  // CONFIG_EXT_INTER
   }
   xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
 }
diff --git a/vp10/common/x86/vp10_txfm1d_sse4.h b/vp10/common/x86/vp10_txfm1d_sse4.h
index 803b86d..86ab660 100644
--- a/vp10/common/x86/vp10_txfm1d_sse4.h
+++ b/vp10/common/x86/vp10_txfm1d_sse4.h
@@ -81,33 +81,32 @@
   }
 }
 
-#define round_shift_32_sse4_1(vec, bit)     \
-  ({                                        \
-    __m128i tmp, round;                     \
-    round = _mm_set1_epi32(1 << (bit - 1)); \
-    tmp = _mm_add_epi32(vec, round);        \
-    _mm_srai_epi32(tmp, bit);               \
-  })
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+  __m128i tmp, round;
+  round = _mm_set1_epi32(1 << (bit - 1));
+  tmp = _mm_add_epi32(vec, round);
+  return _mm_srai_epi32(tmp, bit);
+}
 
-#define round_shift_array_32_sse4_1(input, output, size, bit) \
-  ({                                                          \
-    if (bit > 0) {                                            \
-      int i;                                                  \
-      for (i = 0; i < size; i++) {                            \
-        output[i] = round_shift_32_sse4_1(input[i], bit);     \
-      }                                                       \
-    } else {                                                  \
-      int i;                                                  \
-      for (i = 0; i < size; i++) {                            \
-        output[i] = _mm_slli_epi32(input[i], -bit);           \
-      }                                                       \
-    }                                                         \
-  })
+static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
+                                               const int size, const int bit) {
+  if (bit > 0) {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = round_shift_32_sse4_1(input[i], bit);
+    }
+  } else {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = _mm_slli_epi32(input[i], -bit);
+    }
+  }
+}
 
 // out0 = in0*w0 + in1*w1
 // out1 = -in1*w0 + in0*w1
 #define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
-  ({                                                           \
+  do {                                                         \
     __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
     ww0 = _mm_set1_epi32(w0);                                  \
     ww1 = _mm_set1_epi32(w1);                                  \
@@ -119,12 +118,12 @@
     in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
     out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
     out1 = round_shift_32_sse4_1(out1, bit);                   \
-  })
+  } while (0)
 
 // out0 = in0*w0 + in1*w1
 // out1 = in1*w0 - in0*w1
 #define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
-  ({                                                           \
+  do {                                                         \
     __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
     ww0 = _mm_set1_epi32(w0);                                  \
     ww1 = _mm_set1_epi32(w1);                                  \
@@ -136,7 +135,7 @@
     in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
     out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
     out1 = round_shift_32_sse4_1(out1, bit);                   \
-  })
+  } while (0)
 
 #ifdef __cplusplus
 }
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 49ad376..004f49c 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -28,7 +28,9 @@
 #include "vp10/common/seg_common.h"
 #include "vp10/common/tile_common.h"
 
+#if CONFIG_ANS
 #include "vp10/encoder/buf_ans.h"
+#endif  // CONFIG_ANS
 #include "vp10/encoder/cost.h"
 #include "vp10/encoder/bitstream.h"
 #include "vp10/encoder/encodemv.h"
diff --git a/vp10/encoder/buf_ans.h b/vp10/encoder/buf_ans.h
index 11055d9..8697ee4 100644
--- a/vp10/encoder/buf_ans.h
+++ b/vp10/encoder/buf_ans.h
@@ -52,7 +52,7 @@
 }
 
 static INLINE void buf_uabs_write(struct BufAnsCoder *const c,
-                             uint8_t val, AnsP8 prob) {
+                                  uint8_t val, AnsP8 prob) {
   assert(c->offset <= c->size);
   if (c->offset == c->size) {
     vp10_buf_ans_grow(c);
@@ -85,7 +85,8 @@
       sym.cum_prob = c->buf[offset].val_start;
       rans_write(ans, &sym);
     } else {
-      uabs_write(ans, c->buf[offset].val_start, c->buf[offset].prob);
+      uabs_write(ans, (uint8_t)c->buf[offset].val_start,
+                 (AnsP8)c->buf[offset].prob);
     }
   }
 }
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 437b366..21ed965 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -833,14 +833,18 @@
     cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
     cm->tile_width  <<= MAX_MIB_SIZE_LOG2;
     cm->tile_height <<= MAX_MIB_SIZE_LOG2;
-  } else
-#endif  // CONFIG_EXT_PARTITION
-  {
+  } else {
     cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 64);
     cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
     cm->tile_width  <<= MAX_MIB_SIZE_LOG2 - 1;
     cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
   }
+#else
+  cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 64);
+  cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+  cm->tile_width  <<= MAX_MIB_SIZE_LOG2;
+  cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+#endif  // CONFIG_EXT_PARTITION
 
   cm->tile_width  = VPXMIN(cm->tile_width, cm->mi_cols);
   cm->tile_height = VPXMIN(cm->tile_height, cm->mi_rows);
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 0d2dada..d7a0f1a 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -81,6 +81,12 @@
 #define MIN_EARLY_TERM_INDEX    3
 #define NEW_MV_DISCOUNT_FACTOR  8
 
+#if CONFIG_EXT_INTRA
+#define ANGLE_FAST_SEARCH 1
+#define ANGLE_SKIP_THRESH 10
+#define FILTER_FAST_SEARCH 1
+#endif  // CONFIG_EXT_INTRA
+
 const double ADST_FLIP_SVM[8] = {-6.6623, -2.8062, -3.2531, 3.1671,  // vert
                                  -7.7051, -3.2234, -3.6193, 3.4533};  // horz
 
@@ -2531,99 +2537,134 @@
   return best_rd;
 }
 
-static INLINE int get_angle_index(double angle) {
-  const double step = 22.5, base = 45;
-  return (int)lround((angle - base) / step);
-}
+// Indices are sign, integer, and fractional part of the gradient value
+static const uint8_t gradient_to_angle_bin[2][7][16] = {
+    {
+        {6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, },
+        {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+    },
+    {
+        {6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, },
+        {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+    },
+};
+
+static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
+    0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+};
 
 static void angle_estimation(const uint8_t *src, int src_stride,
-                             int rows, int cols, double *hist) {
-  int r, c, i, index;
-  double angle, dx, dy;
-  double temp, divisor;
+                             int rows, int cols,
+                             uint8_t *directional_mode_skip_mask) {
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
+  uint64_t hist[DIRECTIONAL_MODES];
+  uint64_t hist_sum = 0;
 
-  vpx_clear_system_state();
-  divisor = 0;
-  for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] = 0;
-
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
-      if (dy == 0)
-        angle = 90;
-      else
-        angle = (atan((double)dx / (double)dy)) * 180 / PI;
-      assert(angle >= -90 && angle <= 90);
-      index = get_angle_index(angle + 180);
-      if (index < DIRECTIONAL_MODES) {
-        hist[index] += temp;
-        divisor += temp;
+      if (dy == 0) {
+        index = 2;
+      } else {
+        sn = (dx > 0) ^ (dy > 0);
+        dx = abs(dx);
+        dy = abs(dy);
+        remd = dx % dy;
+        quot = dx / dy;
+        remd = remd * 16 / dy;
+        index = gradient_to_angle_bin[sn][VPXMIN(quot, 6)][VPXMIN(remd, 15)];
       }
-      if (angle > 0) {
-        index = get_angle_index(angle);
-        if (index >= 0) {
-          hist[index] += temp;
-          divisor += temp;
-        }
-      }
+      hist[index] += temp;
     }
     src += src_stride;
   }
 
-  if (divisor < 1)
-    divisor = 1;
   for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] /= divisor;
+    hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {
+    if (i != DC_PRED && i != TM_PRED) {
+      int index = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[index];
+      int weight = 2;
+      if (index > 0) {
+        score += hist[index - 1];
+        weight += 1;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1];
+        weight += 1;
+      }
+      if (score * ANGLE_SKIP_THRESH  < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;
+    }
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
-                                    int rows, int cols, double *hist) {
-  int r, c, i, index;
-  double angle, dx, dy;
-  double temp, divisor;
+                                    int rows, int cols,
+                                    uint8_t *directional_mode_skip_mask) {
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
+  uint64_t hist[DIRECTIONAL_MODES];
+  uint64_t hist_sum = 0;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 
-  vpx_clear_system_state();
-  divisor = 0;
-  for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] = 0;
-
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
-      if (dy == 0)
-        angle = 90;
-      else
-        angle = (atan((double)dx / (double)dy)) * 180 / PI;
-      assert(angle >= -90 && angle <= 90);
-      index = get_angle_index(angle + 180);
-      if (index < DIRECTIONAL_MODES) {
-        hist[index] += temp;
-        divisor += temp;
+      if (dy == 0) {
+        index = 2;
+      } else {
+        sn = (dx > 0) ^ (dy > 0);
+        dx = abs(dx);
+        dy = abs(dy);
+        remd = dx % dy;
+        quot = dx / dy;
+        remd = remd * 16 / dy;
+        index = gradient_to_angle_bin[sn][VPXMIN(quot, 6)][VPXMIN(remd, 15)];
       }
-      if (angle > 0) {
-        index = get_angle_index(angle);
-        if (index >= 0) {
-          hist[index] += temp;
-          divisor += temp;
-        }
-      }
+      hist[index] += temp;
     }
     src += src_stride;
   }
 
-  if (divisor < 1)
-    divisor = 1;
   for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] /= divisor;
+    hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {
+    if (i != DC_PRED && i != TM_PRED) {
+      int index = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[index];
+      int weight = 2;
+      if (index > 0) {
+        score += hist[index - 1];
+        weight += 1;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1];
+        weight += 1;
+      }
+      if (score * ANGLE_SKIP_THRESH  < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;
+    }
+  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EXT_INTRA
@@ -2649,7 +2690,6 @@
   uint8_t directional_mode_skip_mask[INTRA_MODES];
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *src = x->plane[0].src.buf;
-  double hist[DIRECTIONAL_MODES];
 #endif  // CONFIG_EXT_INTRA
   TX_TYPE best_tx_type = DCT_DCT;
   int *bmode_costs;
@@ -2675,29 +2715,11 @@
          sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    highbd_angle_estimation(src, src_stride, rows, cols, hist);
+    highbd_angle_estimation(src, src_stride, rows, cols,
+                            directional_mode_skip_mask);
   else
 #endif
-    angle_estimation(src, src_stride, rows, cols, hist);
-
-  for (mode = 0; mode < INTRA_MODES; ++mode) {
-    if (mode != DC_PRED && mode != TM_PRED) {
-      int index = get_angle_index((double)mode_to_angle_map[mode]);
-      double score, weight = 1.0;
-      score = hist[index];
-      if (index > 0) {
-        score += hist[index - 1] * 0.5;
-        weight += 0.5;
-      }
-      if (index < DIRECTIONAL_MODES - 1) {
-        score += hist[index + 1] * 0.5;
-        weight += 0.5;
-      }
-      score /= weight;
-      if (score < ANGLE_SKIP_THRESH)
-        directional_mode_skip_mask[mode] = 1;
-    }
-  }
+    angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask);
 #endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   palette_mode_info.palette_size[0] = 0;
@@ -6387,10 +6409,16 @@
           joint_motion_search(cpi, x, bsize, frame_mv,
                               mi_row, mi_col, NULL, single_newmv, &rate_mv, 0);
         } else {
+#if CONFIG_REF_MV
+          vp10_set_mvcost(x, mbmi->ref_frame[0]);
+#endif  // CONFIG_REF_MV
           rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                       &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                       x->nmvjointcost, x->mvcost,
                                       MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+          vp10_set_mvcost(x, mbmi->ref_frame[1]);
+#endif  // CONFIG_REF_MV
           rate_mv += vp10_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                                       &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                       x->nmvjointcost, x->mvcost,
@@ -6417,9 +6445,15 @@
                             mi_row, mi_col,
                             single_newmv, &rate_mv, 0);
       } else {
+#if CONFIG_REF_MV
+        vp10_set_mvcost(x, mbmi->ref_frame[0]);
+#endif  // CONFIG_REF_MV
         rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                    &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+        vp10_set_mvcost(x, mbmi->ref_frame[1]);
+#endif  // CONFIG_REF_MV
         rate_mv += vp10_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                                    &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -8138,33 +8172,14 @@
           const uint8_t *src = x->plane[0].src.buf;
           const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
           const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-          double hist[DIRECTIONAL_MODES];
-          PREDICTION_MODE mode;
-
 #if CONFIG_VP9_HIGHBITDEPTH
           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-            highbd_angle_estimation(src, src_stride, rows, cols, hist);
+            highbd_angle_estimation(src, src_stride, rows, cols,
+                                    directional_mode_skip_mask);
           else
 #endif
-            angle_estimation(src, src_stride, rows, cols, hist);
-          for (mode = 0; mode < INTRA_MODES; ++mode) {
-            if (mode != DC_PRED && mode != TM_PRED) {
-              int index = get_angle_index((double)mode_to_angle_map[mode]);
-              double score, weight = 1.0;
-              score = hist[index];
-              if (index > 0) {
-                score += hist[index - 1] * 0.5;
-                weight += 0.5;
-              }
-              if (index < DIRECTIONAL_MODES - 1) {
-                score += hist[index + 1] * 0.5;
-                weight += 0.5;
-              }
-              score /= weight;
-              if (score < ANGLE_SKIP_THRESH)
-                directional_mode_skip_mask[mode] = 1;
-            }
-          }
+            angle_estimation(src, src_stride, rows, cols,
+                             directional_mode_skip_mask);
           angle_stats_ready = 1;
         }
         if (directional_mode_skip_mask[mbmi->mode])
@@ -8433,8 +8448,10 @@
 
           if (!mv_check_bounds(x, &cur_mv.as_mv)) {
             INTERP_FILTER dummy_single_inter_filter[MB_MODE_COUNT]
-                                                   [MAX_REF_FRAMES];
-            int dummy_single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
+                                                   [MAX_REF_FRAMES] =
+                                          { { 0 } };
+            int dummy_single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES] =
+                                          { { 0 } };
             int dummy_disable_skip = 0;
 #if CONFIG_EXT_INTER
             int_mv dummy_single_newmvs[2][MAX_REF_FRAMES] =
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index cd1c91a..b3304a7 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -551,6 +551,12 @@
   else if (oxcf->mode == GOOD)
     set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
+#if CONFIG_REF_MV
+  // TODO(geza): Temporarily turn this off for ref-mv to fix tests.
+  //             Investigate/reimplement skip_recode better to enable this.
+  sf->allow_skip_recode = 0;
+#endif  // CONFIG_REF_MV
+
   // sf->partition_search_breakout_dist_thr is set assuming max 64x64
   // blocks. Normalise this if the blocks are bigger.
   if (MAX_SB_SIZE_LOG2 > 6) {