Add speed feature use_fast_interpolation_filter_search

Applies to speed >= 1. Instead of searching the full dual-filter space
{R,Sm,Sh}x{R,Sm,Sh}, first check only {R}x{R,Sm,Sh}, then check
{R,Sm,Sh}x{best of the previous R,Sm,Sh}.

Saves ~6% of cycles by reducing time spent in av1_convolve_2d_sse2, at the
cost of a 0.023 overall PSNR drop.

Change-Id: I82d7a6321b335293124a007ff4c87f0e260052e1
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 844ee23..55010c2 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -7405,38 +7405,118 @@
       int best_in_temp = 0;
       InterpFilters best_filters = mbmi->interp_filters;
       restore_dst_buf(xd, *tmp_dst);
-      // EIGHTTAP_REGULAR mode is calculated beforehand
-      for (i = 1; i < filter_set_size; ++i) {
+
+#if CONFIG_DUAL_FILTER  // Speed feature use_fast_interpolation_filter_search
+      if (cpi->sf.use_fast_interpolation_filter_search) {
         int tmp_skip_sb = 0;
         int64_t tmp_skip_sse = INT64_MAX;
         int tmp_rs;
         int64_t tmp_rd;
+
+        // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
+        int best_dual_mode = 0;
+        // Find best of {R}x{R,Sm,Sh}
+        // EIGHTTAP_REGULAR mode is calculated beforehand
+        for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+          tmp_skip_sb = 0;
+          tmp_skip_sse = INT64_MAX;
+
+          mbmi->interp_filters =
+              av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
+
+          tmp_rs = av1_get_switchable_rate(cm, x, xd);
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst,
+                                        bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+
+          if (tmp_rd < *rd) {
+            best_dual_mode = i;
+
+            *rd = tmp_rd;
+            *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+            best_filters = mbmi->interp_filters;
+            *skip_txfm_sb = tmp_skip_sb;
+            *skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, *orig_dst);
+            } else {
+              restore_dst_buf(xd, *tmp_dst);
+            }
+          }
+        }
+
+        // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+        for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
+             i += SWITCHABLE_FILTERS) {
+          tmp_skip_sb = 0;
+          tmp_skip_sse = INT64_MAX;
+
+          mbmi->interp_filters =
+              av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
+
+          tmp_rs = av1_get_switchable_rate(cm, x, xd);
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst,
+                                        bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+
+          if (tmp_rd < *rd) {
+            *rd = tmp_rd;
+            *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+            best_filters = mbmi->interp_filters;
+            *skip_txfm_sb = tmp_skip_sb;
+            *skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, *orig_dst);
+            } else {
+              restore_dst_buf(xd, *tmp_dst);
+            }
+          }
+        }
+      } else {
+#endif  // CONFIG_DUAL_FILTER Speed feature use_fast_interpolation_filter_search
+        // EIGHTTAP_REGULAR mode is calculated beforehand
+        for (i = 1; i < filter_set_size; ++i) {
+          int tmp_skip_sb = 0;
+          int64_t tmp_skip_sse = INT64_MAX;
+          int tmp_rs;
+          int64_t tmp_rd;
 #if CONFIG_DUAL_FILTER
-        mbmi->interp_filters =
-            av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
+          mbmi->interp_filters =
+              av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
 #else
         mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i);
 #endif  // CONFIG_DUAL_FILTER
-        tmp_rs = av1_get_switchable_rate(cm, x, xd);
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
-                        &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
-        tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+          tmp_rs = av1_get_switchable_rate(cm, x, xd);
+          av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst,
+                                        bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
 
-        if (tmp_rd < *rd) {
-          *rd = tmp_rd;
-          *switchable_rate = av1_get_switchable_rate(cm, x, xd);
-          best_filters = mbmi->interp_filters;
-          *skip_txfm_sb = tmp_skip_sb;
-          *skip_sse_sb = tmp_skip_sse;
-          best_in_temp = !best_in_temp;
-          if (best_in_temp) {
-            restore_dst_buf(xd, *orig_dst);
-          } else {
-            restore_dst_buf(xd, *tmp_dst);
+          if (tmp_rd < *rd) {
+            *rd = tmp_rd;
+            *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+            best_filters = mbmi->interp_filters;
+            *skip_txfm_sb = tmp_skip_sb;
+            *skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, *orig_dst);
+            } else {
+              restore_dst_buf(xd, *tmp_dst);
+            }
           }
         }
+#if CONFIG_DUAL_FILTER  // Speed feature use_fast_interpolation_filter_search
       }
+#endif  // CONFIG_DUAL_FILTER Speed feature use_fast_interpolation_filter_search
+
       if (best_in_temp) {
         restore_dst_buf(xd, *tmp_dst);
       } else {