Merge "Flexible support for various pattern searches"

diff --git a/configure b/configure
index 24be893..64f0165 100755
--- a/configure
+++ b/configure

@@ -314,6 +314,7 @@
     gprof
     gcov
     pic
+    use_x86inc
     optimizations
     ccache
     runtime_cpu_detect

diff --git a/test/sad_test.cc b/test/sad_test.cc
index bf3e0b8..b85da2f 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc

@@ -428,6 +428,7 @@
 
 #if HAVE_SSE
 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
 const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
@@ -441,6 +442,7 @@
                         make_tuple(4, 4, sad_4x4x4d_sse)));
 #endif
 #endif
+#endif
 
 #if HAVE_SSE2
 #if CONFIG_VP8_ENCODER
@@ -451,6 +453,7 @@
 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
 #endif
 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
 const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
 const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
 const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
@@ -463,6 +466,7 @@
 const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
 const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
 #endif
+#endif
 const sad_m_by_n_test_param_t sse2_tests[] = {
 #if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_wmt),
@@ -472,6 +476,7 @@
   make_tuple(4, 4, sad_4x4_wmt),
 #endif
 #if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
   make_tuple(64, 64, sad_64x64_sse2_vp9),
   make_tuple(64, 32, sad_64x32_sse2_vp9),
   make_tuple(32, 64, sad_32x64_sse2_vp9),
@@ -484,6 +489,7 @@
   make_tuple(8, 8, sad_8x8_sse2_vp9),
   make_tuple(8, 4, sad_8x4_sse2_vp9),
 #endif
+#endif
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 

diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 238290b..a09558f 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h

@@ -57,12 +57,6 @@
 
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
 
-static INLINE const vp9_prob *vp9_get_pred_probs_switchable_interp(
-    const VP9_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = vp9_get_pred_context_switchable_interp(xd);
-  return &cm->fc.switchable_interp_prob[pred_context][0];
-}
-
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm,

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 798df0e..f61d26d 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -480,82 +480,82 @@
 specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
 
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad64x64 sse2
+specialize vp9_sad64x64 $sse2_x86inc
 
 prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x64 sse2
+specialize vp9_sad32x64 $sse2_x86inc
 
 prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad64x32 sse2
+specialize vp9_sad64x32 $sse2_x86inc
 
 prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x16 sse2
+specialize vp9_sad32x16 $sse2_x86inc
 
 prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad16x32 sse2
+specialize vp9_sad16x32 $sse2_x86inc
 
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad32x32 sse2
+specialize vp9_sad32x32 $sse2_x86inc
 
 prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x16 mmx sse2
+specialize vp9_sad16x16 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x8 mmx sse2
+specialize vp9_sad16x8 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x16 mmx sse2
+specialize vp9_sad8x16 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x8 mmx sse2
+specialize vp9_sad8x8 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad8x4 sse2
+specialize vp9_sad8x4 $sse2_x86inc
 
 prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad4x8 sse
+specialize vp9_sad4x8 $sse_x86inc
 
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad4x4 mmx sse
+specialize vp9_sad4x4 mmx $sse_x86inc
 
 prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad64x64_avg sse2
+specialize vp9_sad64x64_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad32x64_avg sse2
+specialize vp9_sad32x64_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad64x32_avg sse2
+specialize vp9_sad64x32_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad32x16_avg sse2
+specialize vp9_sad32x16_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad16x32_avg sse2
+specialize vp9_sad16x32_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad32x32_avg sse2
+specialize vp9_sad32x32_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad16x16_avg sse2
+specialize vp9_sad16x16_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad16x8_avg sse2
+specialize vp9_sad16x8_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad8x16_avg sse2
+specialize vp9_sad8x16_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad8x8_avg sse2
+specialize vp9_sad8x8_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad8x4_avg sse2
+specialize vp9_sad8x4_avg $sse2_x86inc
 
 prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad4x8_avg sse
+specialize vp9_sad4x8_avg $sse_x86inc
 
 prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad4x4_avg sse
+specialize vp9_sad4x4_avg $sse_x86inc
 
 prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_h sse2

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 2d9be6a..074b3e9 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -370,9 +370,9 @@
     VP9D_COMP *pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  const vp9_prob *probs = vp9_get_pred_probs_switchable_interp(cm, xd);
-  const int type = treed_read(r, vp9_switchable_interp_tree, probs);
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
+  const int type = treed_read(r, vp9_switchable_interp_tree,
+                              cm->fc.switchable_interp_prob[ctx]);
   ++cm->counts.switchable_interp[ctx][type];
   return type;
 }

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index e79bc81..6a50c80 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -477,12 +477,13 @@
       }
     }
 
-    if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+    if (pc->mcomp_filter_type == SWITCHABLE) {
+      const int ctx = vp9_get_pred_context_switchable_interp(xd);
       write_token(bc, vp9_switchable_interp_tree,
-                  vp9_get_pred_probs_switchable_interp(&cpi->common, xd),
+                  pc->fc.switchable_interp_prob[ctx],
                   &vp9_switchable_interp_encodings[mi->interp_filter]);
     } else {
-      assert(mi->interp_filter == cpi->common.mcomp_filter_type);
+      assert(mi->interp_filter == pc->mcomp_filter_type);
     }
 
     if (bsize < BLOCK_8X8) {

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8bd3500..82859c5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -333,13 +333,14 @@
 static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                          BLOCK_SIZE_TYPE bsize, int output_enabled) {
   int i, x_idx, y;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   int mb_mode_index = ctx->best_mode_index;
-  const int mis = cpi->common.mode_info_stride;
+  const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
@@ -351,15 +352,12 @@
 
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  for (y = 0; y < mi_height; y++) {
-    for (x_idx = 0; x_idx < mi_width; x_idx++) {
+  for (y = 0; y < mi_height; y++)
+    for (x_idx = 0; x_idx < mi_width; x_idx++)
       if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx
-          && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) {
-        MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
-        *mi_addr = *mi;
-      }
-    }
-  }
+          && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y)
+        xd->mode_info_context[x_idx + y * mis] = *mi;
+
   // FIXME(rbultje) I'm pretty sure this should go to the end of this block
   // (i.e. after the output_enabled)
   if (bsize < BLOCK_32X32) {
@@ -383,7 +381,7 @@
       cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
   }
 
-  if (cpi->common.frame_type == KEY_FRAME) {
+  if (cm->frame_type == KEY_FRAME) {
     // Restore the coding modes to that held in the coding context
     // if (mb_mode == I4X4_PRED)
     //    for (i = 0; i < 16; i++)
@@ -436,19 +434,17 @@
             xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
 
-    if (cpi->common.mcomp_filter_type == SWITCHABLE
-        && is_inter_mode(mbmi->mode)) {
-      ++cpi->common.counts.switchable_interp[
-          vp9_get_pred_context_switchable_interp(xd)][mbmi->interp_filter];
+    if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) {
+      const int ctx = vp9_get_pred_context_switchable_interp(xd);
+      ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
     }
 
     cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
     cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
     cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
 
-    for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
       cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
-    }
   }
 }
 

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 8990aa7..238c981 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -2427,13 +2427,12 @@
   return scaled_ref_frame;
 }
 
-static INLINE int get_switchable_rate(MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-
-  const int c = vp9_get_pred_context_switchable_interp(xd);
-  const int m = mbmi->interp_filter;
-  return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+static INLINE int get_switchable_rate(const MACROBLOCK *x) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const int ctx = vp9_get_pred_context_switchable_interp(xd);  // cost-table row
+  return SWITCHABLE_INTERP_RATE_FACTOR *  // cost of mbmi's filter, scaled
+             x->switchable_interp_costs[ctx][mbmi->interp_filter];
+}
 
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 572a28d..5a96fba 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -80,7 +80,6 @@
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
@@ -89,6 +88,7 @@
 
 ifeq ($(USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 endif