Improve temporal filter prediction

In temporal filtering, apply a high-precision 12-tap filter in the
prediction process. This gives a good coding gain.

Borg test result at speed 1:
        avg_psnr  ovr_psnr  ssim
lowres2: -0.526    -0.517  -0.337
midres2: -0.359    -0.349  -0.161
hdres2:  -0.168    -0.162  -0.083

TODO: Add SIMD code for the 12-tap filter so that most of the encoder
slowdown can be recovered.

STATS_CHANGED

Change-Id: I88c7dcd2c9afe00d52e5299b1330f42c6e1e01cb
(cherry picked from commit 144a941ec51cb998ea7923445e183bbaf048a777)
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 0a25396..716886f 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -561,15 +561,35 @@
   if (!need_x && !need_y) {
     aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
   } else if (need_x && !need_y) {
-    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                      subpel_x_qn, conv_params);
+    // Filters with taps > 8 are only for encoder side use.
+    // TODO(any): need SIMD for > 8 taps filters
+    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
+      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_x, subpel_x_qn, conv_params);
+    } else {
+      av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    }
   } else if (!need_x && need_y) {
-    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
-                      subpel_y_qn);
+    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_y, subpel_y_qn);
+    } else {
+      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                        subpel_y_qn);
+    }
   } else {
     assert(need_x && need_y);
-    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
+
+    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
+      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                           filter_params_x, filter_params_y, subpel_x_qn,
+                           subpel_y_qn, conv_params);
+    } else {
+      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    }
   }
 }
 
@@ -1064,19 +1084,43 @@
     const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   const bool need_x = subpel_x_qn != 0;
   const bool need_y = subpel_y_qn != 0;
+  // Filters with taps > 8 are only for encoder side use.
+  const int filter_x_taps_gt8 =
+      (filter_params_x == NULL) ? 0 : ((filter_params_x->taps > 8) ? 1 : 0);
+  const int filter_y_taps_gt8 =
+      (filter_params_y == NULL) ? 0 : ((filter_params_y->taps > 8) ? 1 : 0);
+
   if (!need_x && !need_y) {
     aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
   } else if (need_x && !need_y) {
-    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
-                             filter_params_x, subpel_x_qn, conv_params, bd);
+    // TODO(any): need SIMD for > 8 taps filters
+    if (filter_x_taps_gt8 || filter_y_taps_gt8) {
+      av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_x, subpel_x_qn, conv_params, bd);
+
+    } else {
+      av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, subpel_x_qn, conv_params, bd);
+    }
   } else if (!need_x && need_y) {
-    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
-                             filter_params_y, subpel_y_qn, bd);
+    if (filter_x_taps_gt8 || filter_y_taps_gt8) {
+      av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_y, subpel_y_qn, bd);
+    } else {
+      av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_y, subpel_y_qn, bd);
+    }
   } else {
     assert(need_x && need_y);
-    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
-                              filter_params_x, filter_params_y, subpel_x_qn,
-                              subpel_y_qn, conv_params, bd);
+    if (filter_x_taps_gt8 || filter_y_taps_gt8) {
+      av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                  filter_params_x, filter_params_y, subpel_x_qn,
+                                  subpel_y_qn, conv_params, bd);
+    } else {
+      av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params, bd);
+    }
   }
 }
 
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 787e699..16a9450 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -32,6 +32,9 @@
   EIGHTTAP_SMOOTH,
   MULTITAP_SHARP,
   BILINEAR,
+  // Encoder side only filters
+  MULTITAP_SHARP2,
+
   INTERP_FILTERS_ALL,
   SWITCHABLE_FILTERS = BILINEAR,
   SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
@@ -166,14 +169,38 @@
   { 0, 0, 4, 36, 62, 26, 0, 0 },    { 0, 0, 2, 34, 62, 28, 2, 0 }
 };
 
+DECLARE_ALIGNED(256, static const int16_t,
+                av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
+  { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+  { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
+  { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 },
+  { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 },
+  { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 },
+  { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 },
+  { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 },
+  { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 },
+  { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 },
+  { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 },
+  { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 },
+  { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 },
+  { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 },
+  { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 }
+};
+
 static const InterpFilterParams
-    av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+    av1_interp_filter_params_list[INTERP_FILTERS_ALL] = {
       { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR },
       { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
         EIGHTTAP_SMOOTH },
       { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS,
         MULTITAP_SHARP },
-      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }
+      { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
+
+      // The following filters are encoder-only and are currently used in
+      // temporal filtering, where the predictor block size is >= 16.
+      { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 },
     };
 
 // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 77d4a23..922b9ae 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -325,10 +325,10 @@
 
   // Default interpolation filters.
 #if CONFIG_REMOVE_DUAL_FILTER
-  const InterpFilter interp_filters = MULTITAP_SHARP;
+  const InterpFilter interp_filters = MULTITAP_SHARP2;
 #else
   const int_interpfilters interp_filters =
-      av1_broadcast_interp_filter(MULTITAP_SHARP);
+      av1_broadcast_interp_filter(MULTITAP_SHARP2);
 #endif  // !CONFIG_REMOVE_DUAL_FILTER
 
   // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index e87f914..ee4352b 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -20,6 +20,11 @@
 
 namespace {
 
+// TODO(any): Remove the following INTERP_FILTERS_ALL redefinition once 12-tap
+// filter SIMD is done, so that the 12-tap filter is also tested.
+#undef INTERP_FILTERS_ALL
+#define INTERP_FILTERS_ALL 4
+
 // All single reference convolve tests are parameterized on block size,
 // bit-depth, and function to test.
 //