Add speed feature use_fast_interpolation_filter_search

Applies to speed >= 1. Instead of searching the full dual filter space
{R,Sm,Sh}x{R,Sm,Sh}, only check {R}x{R,Sm,Sh}, followed by
{R,Sm,Sh}x{best of the previous R,Sm,Sh}.

Saves ~6% of cycles by reducing time spent in av1_convolve_2d_sse2,
at the cost of a 0.023 overall PSNR drop.

Change-Id: I82d7a6321b335293124a007ff4c87f0e260052e1
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 844ee23..55010c2 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -7405,38 +7405,118 @@
       int best_in_temp = 0;
       InterpFilters best_filters = mbmi->interp_filters;
       restore_dst_buf(xd, *tmp_dst);
-      // EIGHTTAP_REGULAR mode is calculated beforehand
-      for (i = 1; i < filter_set_size; ++i) {
+
+#if CONFIG_DUAL_FILTER  // Speed feature use_fast_interpolation_filter_search
+      if (cpi->sf.use_fast_interpolation_filter_search) {
         int tmp_skip_sb = 0;
         int64_t tmp_skip_sse = INT64_MAX;
         int tmp_rs;
         int64_t tmp_rd;
+
+        // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
+        int best_dual_mode = 0;
+        // Find best of {R}x{R,Sm,Sh}
+        // EIGHTTAP_REGULAR mode is calculated beforehand
+        for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+          tmp_skip_sb = 0;
+          tmp_skip_sse = INT64_MAX;
+
+          mbmi->interp_filters =
+              av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
+
+          tmp_rs = av1_get_switchable_rate(cm, x, xd);
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst,
+                                        bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+
+          if (tmp_rd < *rd) {
+            best_dual_mode = i;
+
+            *rd = tmp_rd;
+            *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+            best_filters = mbmi->interp_filters;
+            *skip_txfm_sb = tmp_skip_sb;
+            *skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, *orig_dst);
+            } else {
+              restore_dst_buf(xd, *tmp_dst);
+            }
+          }
+        }
+
+        // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+        for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
+             i += SWITCHABLE_FILTERS) {
+          tmp_skip_sb = 0;
+          tmp_skip_sse = INT64_MAX;
+
+          mbmi->interp_filters =
+              av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
+
+          tmp_rs = av1_get_switchable_rate(cm, x, xd);
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst,
+                                        bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+
+          if (tmp_rd < *rd) {
+            *rd = tmp_rd;
+            *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+            best_filters = mbmi->interp_filters;
+            *skip_txfm_sb = tmp_skip_sb;
+            *skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, *orig_dst);
+            } else {
+              restore_dst_buf(xd, *tmp_dst);
+            }
+          }
+        }
+      } else {
+#endif  // CONFIG_DUAL_FILTER Speed feature use_fast_interpolation_filter_search
+        // EIGHTTAP_REGULAR mode is calculated beforehand
+        for (i = 1; i < filter_set_size; ++i) {
+          int tmp_skip_sb = 0;
+          int64_t tmp_skip_sse = INT64_MAX;
+          int tmp_rs;
+          int64_t tmp_rd;
 #if CONFIG_DUAL_FILTER
-        mbmi->interp_filters =
-            av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
+          mbmi->interp_filters =
+              av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
 #else
         mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i);
 #endif  // CONFIG_DUAL_FILTER
-        tmp_rs = av1_get_switchable_rate(cm, x, xd);
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
-                        &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
-        tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+          tmp_rs = av1_get_switchable_rate(cm, x, xd);
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst,
+                                        bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
 
-        if (tmp_rd < *rd) {
-          *rd = tmp_rd;
-          *switchable_rate = av1_get_switchable_rate(cm, x, xd);
-          best_filters = mbmi->interp_filters;
-          *skip_txfm_sb = tmp_skip_sb;
-          *skip_sse_sb = tmp_skip_sse;
-          best_in_temp = !best_in_temp;
-          if (best_in_temp) {
-            restore_dst_buf(xd, *orig_dst);
-          } else {
-            restore_dst_buf(xd, *tmp_dst);
+          if (tmp_rd < *rd) {
+            *rd = tmp_rd;
+            *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+            best_filters = mbmi->interp_filters;
+            *skip_txfm_sb = tmp_skip_sb;
+            *skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, *orig_dst);
+            } else {
+              restore_dst_buf(xd, *tmp_dst);
+            }
           }
         }
+#if CONFIG_DUAL_FILTER  // Speed feature use_fast_interpolation_filter_search
       }
+#endif  // CONFIG_DUAL_FILTER Speed feature use_fast_interpolation_filter_search
+
       if (best_in_temp) {
         restore_dst_buf(xd, *tmp_dst);
       } else {
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 4f75469..8833a7b 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -141,6 +141,9 @@
 #if CONFIG_EXT_PARTITION_TYPES
     sf->prune_ext_partition_types_search = 1;
 #endif  // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_DUAL_FILTER
+    sf->use_fast_interpolation_filter_search = 1;
+#endif  // CONFIG_DUAL_FILTER
   }
 
   if (speed >= 2) {
@@ -453,6 +456,7 @@
   // Set this at the appropriate speed levels
   sf->use_transform_domain_distortion = 0;
   sf->gm_search_type = GM_FULL_SEARCH;
+  sf->use_fast_interpolation_filter_search = 0;
 
   if (oxcf->mode == GOOD
 #if CONFIG_XIPHRC
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index b2e5d6e..fedefaa 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -498,6 +498,10 @@
   int use_transform_domain_distortion;
 
   GM_SEARCH_TYPE gm_search_type;
+
+  // Do limited interpolation filter search for dual filters, since best choice
+  // usually includes EIGHTTAP_REGULAR.
+  int use_fast_interpolation_filter_search;
 } SPEED_FEATURES;
 
 struct AV1_COMP;