Use 256-byte aligned filter tables

This avoids having to duplicate all of the filter tables. Includes fixups
to the convolve routines and associated tests to make this work.

Change-Id: I922f86021594e55072ddb63b42b2313605db6e00
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index f1b5915..b062e7d 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -19,7 +19,6 @@
 
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT  7
-#define ALIGN_FILTERS_256 0
 
 /* Assume a bank of 16 filters to choose from. There are two implementations
  * for filter wrapping behavior, since we want to be able to pick which filter
@@ -34,8 +33,11 @@
  *    always 256 byte aligned.
  *
  * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
- * parameters, and switching between them is trivial.
+ * parameters, and switching between them is trivial, with the
+ * ALIGN_FILTERS_256 macro, below.
  */
+ #define ALIGN_FILTERS_256 1
+
 static void convolve_horiz_c(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              const int16_t *filter_x0, int x_step_q4,
@@ -56,11 +58,12 @@
     const int16_t *filter_x = filter_x0;
 
     /* Initial phase offset */
-    int x_q4 = (filter_x - filter_x_base) / taps;
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
 
     for (x = 0; x < w; ++x) {
       /* Per-pixel src offset */
-      int src_x = x_q4 >> 4;
+      int src_x = (x_q4 - x0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[src_x + k] * filter_x[k];
@@ -97,11 +100,12 @@
     const int16_t *filter_x = filter_x0;
 
     /* Initial phase offset */
-    int x_q4 = (filter_x - filter_x_base) / taps;
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
 
     for (x = 0; x < w; ++x) {
       /* Per-pixel src offset */
-      int src_x = x_q4 >> 4;
+      int src_x = (x_q4 - x0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[src_x + k] * filter_x[k];
@@ -138,11 +142,12 @@
     const int16_t *filter_y = filter_y0;
 
     /* Initial phase offset */
-    int y_q4 = (filter_y - filter_y_base) / taps;
+    int y0_q4 = (filter_y - filter_y_base) / taps;
+    int y_q4 = y0_q4;
 
     for (y = 0; y < h; ++y) {
       /* Per-pixel src offset */
-      int src_y = y_q4 >> 4;
+      int src_y = (y_q4 - y0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[(src_y + k) * src_stride] * filter_y[k];
@@ -179,11 +184,12 @@
     const int16_t *filter_y = filter_y0;
 
     /* Initial phase offset */
-    int y_q4 = (filter_y - filter_y_base) / taps;
+    int y0_q4 = (filter_y - filter_y_base) / taps;
+    int y_q4 = y0_q4;
 
     for (y = 0; y < h; ++y) {
       /* Per-pixel src offset */
-      int src_y = y_q4 >> 4;
+      int src_y = (y_q4 - y0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[(src_y + k) * src_stride] * filter_y[k];