Implement shorter-tap first in convolve_round

The performance change is 0.004% on lowres

Change-Id: If3702ba6377ac42997e7d49b8959ff16fb182daa

commit: 118bf67cb61f492023da085eb7446f6d9cbd598c [log] [tgz]
author: Angie Chiang <angiebird@google.com> Fri Feb 03 17:12:44 2017 -0800
committer: Angie Chiang <angiebird@google.com> Sun Feb 12 19:38:43 2017 +0000
tree: 830c87e71f65d3b909710484b135e9fbea7ab492
parent: befcc42572b88c6ff983d1000fa4eddc4bb41f26 [diff]
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 6cd24e3..1bd1ab4 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c

@@ -252,6 +252,24 @@
   }
 }
 
+static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
+                                   const uint8_t *src, int src_stride, int w,
+                                   int h) {
+  int r, c;
+  for (r = 0; r < h; ++r)
+    for (c = 0; c < w; ++c)
+      dst[c * (dst_stride) + r] = src[r * (src_stride) + c];
+}
+
+static INLINE void transpose_int32(int32_t *dst, int dst_stride,
+                                   const int32_t *src, int src_stride, int w,
+                                   int h) {
+  int r, c;
+  for (r = 0; r < h; ++r)
+    for (c = 0; c < w; ++c)
+      dst[c * (dst_stride) + r] = src[r * (src_stride) + c];
+}
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilter *interp_filter,
@@ -272,9 +290,33 @@
     // This will reduce hardware implementation cost.
     filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
   }
-  av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride, w,
-                  h, &filter_params_x, &filter_params_y, subpel_x_q4,
-                  subpel_y_q4, conv_params);
+
+  if (filter_params_y.taps < filter_params_x.taps) {
+    uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
+                   (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
+    int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
+    CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+    int tr_dst_stride = MAX_SB_SIZE;
+    int fo_vert = filter_params_y.taps / 2 - 1;
+    int fo_horiz = filter_params_x.taps / 2 - 1;
+
+    transpose_uint8(tr_src, tr_src_stride,
+                    src - fo_vert * src_stride - fo_horiz, src_stride,
+                    w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
+    transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
+                    conv_params->dst_stride, w, h);
+
+    // horizontal and vertical parameters are swapped because of the transpose
+    av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride,
+                    tr_dst, tr_dst_stride, h, w, &filter_params_y,
+                    &filter_params_x, subpel_y_q4, subpel_x_q4, conv_params);
+    transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
+                    tr_dst_stride, h, w);
+  } else {
+    av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride,
+                    w, h, &filter_params_x, &filter_params_y, subpel_x_q4,
+                    subpel_y_q4, conv_params);
+  }
 }
 
 #endif  // CONFIG_CONVOLVE_ROUND
commit	118bf67cb61f492023da085eb7446f6d9cbd598c	[log] [tgz]
author	Angie Chiang <angiebird@google.com>	Fri Feb 03 17:12:44 2017 -0800
committer	Angie Chiang <angiebird@google.com>	Sun Feb 12 19:38:43 2017 +0000
tree	830c87e71f65d3b909710484b135e9fbea7ab492
parent	befcc42572b88c6ff983d1000fa4eddc4bb41f26 [diff]