Merge "Cache loop filter errors."

diff --git a/examples.mk b/examples.mk
index 2337d1e..a10ee52 100644
--- a/examples.mk
+++ b/examples.mk

@@ -64,6 +64,10 @@
 vp9_spatial_scalable_encoder.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
 vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
 
+ifeq ($(CONFIG_SHARED),no)
+UTILS-$(CONFIG_VP9_ENCODER)    += resize_util.c
+endif
+
 # XMA example disabled for now, not used in VP8
 #UTILS-$(CONFIG_DECODERS)    += example_xma.c
 #example_xma.GUID             = A955FC4A-73F1-44F7-135E-30D84D32F022

diff --git a/resize_util.c b/resize_util.c
new file mode 100644
index 0000000..b068f55
--- /dev/null
+++ b/resize_util.c

@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vp9/encoder/vp9_resize.h"
+
+// Prints the command-line synopsis for this tool to stdout.
+static void usage(char *progname) {
+  printf("Usage:\n"
+         "%s <input_yuv> <width>x<height> <target_width>x<target_height> "
+         "<output_yuv> [<frames>]\n", progname);
+}
+
+// Parses "<W>x<H>" ('x' or 'X' right after the width). Stores both values and
+// returns 1 only if each is in 1..INT_MAX and nothing follows H; else 0.
+static int parse_dim(char *v, int *width, int *height) {
+  char *sep = NULL, *end = NULL;
+  const long w = strtol(v, &sep, 10);
+  const long h = (sep != v && (*sep == 'x' || *sep == 'X')) ?
+                 strtol(sep + 1, &end, 10) : 0;
+  if (w <= 0 || h <= 0 || w > INT_MAX || h > INT_MAX || *end != '\0')
+    return 0;  // *end is tested last: h > 0 means the second strtol set it.
+  *width = (int)w;
+  *height = (int)h;
+  return 1;
+}
+
+// Reads raw planar 4:2:0 frames from argv[1], resizes each one and writes it
+// to argv[4] (argv[5] caps the frame count). Returns 0 on success, 1 on error.
+int main(int argc, char *argv[]) {
+  FILE *fpin, *fpout;
+  uint8_t *inbuf, *inbuf_u, *inbuf_v;
+  uint8_t *outbuf, *outbuf_u, *outbuf_v;
+  int width, height, target_width, target_height;
+  int f = 0, frames, status = 1;
+
+  if (argc < 5) {
+    printf("Incorrect parameters:\n");
+    usage(argv[0]);
+    return 1;
+  }
+  if (!parse_dim(argv[2], &width, &height)) {
+    printf("Incorrect parameters: %s\n", argv[2]);
+    usage(argv[0]);
+    return 1;
+  }
+  if (!parse_dim(argv[3], &target_width, &target_height)) {
+    printf("Incorrect parameters: %s\n", argv[3]);
+    usage(argv[0]);
+    return 1;
+  }
+  frames = (argc >= 6) ? atoi(argv[5]) : INT_MAX;
+
+  fpin = fopen(argv[1], "rb");
+  if (fpin == NULL) {
+    printf("Can't open file %s to read\n", argv[1]);
+    usage(argv[0]);
+    return 1;
+  }
+  fpout = fopen(argv[4], "wb");
+  if (fpout == NULL) {
+    printf("Can't open file %s to write\n", argv[4]);
+    usage(argv[0]);
+    fclose(fpin);  // Do not leak the already-open input stream.
+    return 1;
+  }
+  printf("Input size:  %dx%d\n", width, height);
+  printf("Target size: %dx%d, Frames: ", target_width, target_height);
+  if (frames == INT_MAX)
+    printf("All\n");
+  else
+    printf("%d\n", frames);
+
+  // Each buffer holds one frame: the Y plane, then U, then V (1/4 of Y each).
+  inbuf = (uint8_t*)malloc(width * height * 3 / 2);
+  outbuf = (uint8_t*)malloc(target_width * target_height * 3 / 2);
+  if (inbuf == NULL || outbuf == NULL) {
+    printf("Can't allocate frame buffers\n");
+    goto cleanup;  // status is still 1 here.
+  }
+  inbuf_u = inbuf + width * height;
+  inbuf_v = inbuf_u + width * height / 4;
+  outbuf_u = outbuf + target_width * target_height;
+  outbuf_v = outbuf_u + target_width * target_height / 4;
+
+  status = 0;
+  // Stop at the frame cap, at end of input (short read) or on a write error.
+  while (f < frames && fread(inbuf, width * height * 3 / 2, 1, fpin) == 1) {
+    vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
+                        width, outbuf, target_width, outbuf_u, outbuf_v,
+                        target_width / 2, target_height, target_width);
+    if (fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout) != 1) {
+      printf("Can't write frame %d to %s\n", f, argv[4]);
+      status = 1;
+      break;
+    }
+    f++;
+  }
+  printf("%d frames processed\n", f);
+cleanup:
+  fclose(fpin);
+  if (fclose(fpout) != 0)
+    status = 1;  // Buffered output is flushed on close, which can fail.
+  free(inbuf);
+  free(outbuf);
+  return status;
+}

diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 65d087a..baa943b 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm

@@ -347,29 +347,32 @@
     ldrb                r12, [r12]
     vdup.u8             d0, r12
 
+    ; preload 8 left
+    vld1.8              d30, [r3]
+
     ; Load above 8 pixels
     vld1.64             {d2}, [r2]
 
+    vmovl.u8            q10, d30
+
     ; Compute above - ytop_left
     vsubl.u8            q3, d2, d0
 
     ; Load left row by row and compute left + (above - ytop_left)
-    vld1.u8             {d6}, [r3]
-
     ; 1st row and 2nd row
-    vdup.u8             d0, d6[0]
-    vdup.u8             d1, d6[1]
-    vaddw.s16           q1, q3, d0
-    vaddw.s16           q2, q3, d1
+    vdup.16             q0, d20[0]
+    vdup.16             q1, d20[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
 
     ; 3rd row and 4th row
-    vdup.u8             d0, d6[2]
-    vdup.u8             d1, d6[3]
-    vaddw.s16           q8, q3, d0
-    vaddw.s16           q9, q3, d1
+    vdup.16             q8, d20[2]
+    vdup.16             q9, d20[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqshrun.s16         d0, q0, #0
+    vqshrun.s16         d1, q1, #0
     vqshrun.s16         d2, q8, #0
     vqshrun.s16         d3, q9, #0
 
@@ -379,19 +382,19 @@
     vst1.64             {d3}, [r0], r1
 
     ; 5th row and 6th row
-    vdup.u8             d0, d6[4]
-    vdup.u8             d1, d6[5]
-    vaddw.s16           q1, q3, d0
-    vaddw.s16           q2, q3, d1
+    vdup.16             q0, d21[0]
+    vdup.16             q1, d21[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
 
-    ; 7rd row and 8th row
-    vdup.u8             d0, d6[6]
-    vdup.u8             d1, d6[7]
-    vaddw.s16           q8, q3, d0
-    vaddw.s16           q9, q3, d1
+    ; 7th row and 8th row
+    vdup.16             q8, d21[2]
+    vdup.16             q9, d21[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqshrun.s16         d0, q0, #0
+    vqshrun.s16         d1, q1, #0
     vqshrun.s16         d2, q8, #0
     vqshrun.s16         d3, q9, #0
 

diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index f15abc0..0766b51 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c

@@ -16,7 +16,6 @@
 #include <string.h>
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_resize.h"
-#include "vpx/vpx_integer.h"
 
 #define FILTER_BITS               7
 
@@ -30,8 +29,44 @@
 
 typedef int16_t interp_kernel[INTERP_TAPS];
 
-// Filters for interpolation - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters[(1 << SUBPEL_BITS)] = {
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+  {-3,  0, 35, 64, 35,  0, -3, 0},
+  {-3, -1, 34, 64, 36,  1, -3, 0},
+  {-3, -1, 32, 64, 38,  1, -3, 0},
+  {-2, -2, 31, 63, 39,  2, -3, 0},
+  {-2, -2, 29, 63, 41,  2, -3, 0},
+  {-2, -2, 28, 63, 42,  3, -4, 0},
+  {-2, -3, 27, 63, 43,  4, -4, 0},
+  {-2, -3, 25, 62, 45,  5, -4, 0},
+  {-2, -3, 24, 62, 46,  5, -4, 0},
+  {-2, -3, 23, 61, 47,  6, -4, 0},
+  {-2, -3, 21, 60, 49,  7, -4, 0},
+  {-1, -4, 20, 60, 50,  8, -4, -1},
+  {-1, -4, 19, 59, 51,  9, -4, -1},
+  {-1, -4, 17, 58, 52, 10, -4, 0},
+  {-1, -4, 16, 57, 53, 12, -4, -1},
+  {-1, -4, 15, 56, 54, 13, -4, -1},
+  {-1, -4, 14, 55, 55, 14, -4, -1},
+  {-1, -4, 13, 54, 56, 15, -4, -1},
+  {-1, -4, 12, 53, 57, 16, -4, -1},
+  {0, -4, 10, 52, 58, 17, -4, -1},
+  {-1, -4,  9, 51, 59, 19, -4, -1},
+  {-1, -4,  8, 50, 60, 20, -4, -1},
+  {0, -4,  7, 49, 60, 21, -3, -2},
+  {0, -4,  6, 47, 61, 23, -3, -2},
+  {0, -4,  5, 46, 62, 24, -3, -2},
+  {0, -4,  5, 45, 62, 25, -3, -2},
+  {0, -4,  4, 43, 63, 27, -3, -2},
+  {0, -4,  3, 42, 63, 28, -2, -2},
+  {0, -3,  2, 41, 63, 29, -2, -2},
+  {0, -3,  2, 39, 63, 31, -2, -2},
+  {0, -3,  1, 38, 64, 32, -1, -3},
+  {0, -3,  1, 36, 64, 34, -1, -3}
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
   {-1, -8, 33, 80, 33, -8, -1, 0},
   {-1, -8, 30, 80, 35, -8, -1, 1},
   {-1, -8, 28, 80, 37, -7, -2, 1},
@@ -66,10 +101,132 @@
   {1, -1, -8, 35, 80, 30, -8, -1},
 };
 
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+  {2, -11,  25,  96,  25, -11,   2, 0},
+  {2, -11,  22,  96,  28, -11,   2, 0},
+  {2, -10,  19,  95,  31, -11,   2, 0},
+  {2, -10,  17,  95,  34, -12,   2, 0},
+  {2,  -9,  14,  94,  37, -12,   2, 0},
+  {2,  -8,  12,  93,  40, -12,   1, 0},
+  {2,  -8,   9,  92,  43, -12,   1, 1},
+  {2,  -7,   7,  91,  46, -12,   1, 0},
+  {2,  -7,   5,  90,  49, -12,   1, 0},
+  {2,  -6,   3,  88,  52, -12,   0, 1},
+  {2,  -5,   1,  86,  55, -12,   0, 1},
+  {2,  -5,  -1,  84,  58, -11,   0, 1},
+  {2,  -4,  -2,  82,  61, -11,  -1, 1},
+  {2,  -4,  -4,  80,  64, -10,  -1, 1},
+  {1, -3, -5, 77, 67, -9, -1, 1},
+  {1, -3, -6, 75, 70, -8, -2, 1},
+  {1, -2, -7, 72, 72, -7, -2, 1},
+  {1, -2, -8, 70, 75, -6, -3, 1},
+  {1, -1, -9, 67, 77, -5, -3, 1},
+  {1,  -1, -10,  64,  80,  -4,  -4, 2},
+  {1,  -1, -11,  61,  82,  -2,  -4, 2},
+  {1,   0, -11,  58,  84,  -1,  -5, 2},
+  {1,   0, -12,  55,  86,   1,  -5, 2},
+  {1,   0, -12,  52,  88,   3,  -6, 2},
+  {0,   1, -12,  49,  90,   5,  -7, 2},
+  {0,   1, -12,  46,  91,   7,  -7, 2},
+  {1,   1, -12,  43,  92,   9,  -8, 2},
+  {0,   1, -12,  40,  93,  12,  -8, 2},
+  {0,   2, -12,  37,  94,  14,  -9, 2},
+  {0,   2, -12,  34,  95,  17, -10, 2},
+  {0,   2, -11,  31,  95,  19, -10, 2},
+  {0,   2, -11,  28,  96,  22, -11, 2}
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+  {3,  -8,  13, 112,  13,  -8,   3, 0},
+  {3,  -7,  10, 112,  17,  -9,   3, -1},
+  {2,  -6,   7, 111,  21,  -9,   3, -1},
+  {2,  -5,   4, 111,  24, -10,   3, -1},
+  {2,  -4,   1, 110,  28, -11,   3, -1},
+  {1,  -3,  -1, 108,  32, -12,   4, -1},
+  {1,  -2,  -3, 106,  36, -13,   4, -1},
+  {1,  -1,  -6, 105,  40, -14,   4, -1},
+  {1,  -1,  -7, 102,  44, -14,   4, -1},
+  {1,   0,  -9, 100,  48, -15,   4, -1},
+  {1,   1, -11,  97,  53, -16,   4, -1},
+  {0,   1, -12,  95,  57, -16,   4, -1},
+  {0,   2, -13,  91,  61, -16,   4, -1},
+  {0,   2, -14,  88,  65, -16,   4, -1},
+  {0,   3, -15,  84,  69, -17,   4, 0},
+  {0,   3, -16,  81,  73, -16,   3, 0},
+  {0,   3, -16,  77,  77, -16,   3, 0},
+  {0,   3, -16,  73,  81, -16,   3, 0},
+  {0,   4, -17,  69,  84, -15,   3, 0},
+  {-1,   4, -16,  65,  88, -14,   2, 0},
+  {-1,   4, -16,  61,  91, -13,   2, 0},
+  {-1,   4, -16,  57,  95, -12,   1, 0},
+  {-1,   4, -16,  53,  97, -11,   1, 1},
+  {-1,   4, -15,  48, 100,  -9,   0, 1},
+  {-1,   4, -14,  44, 102,  -7,  -1, 1},
+  {-1,   4, -14,  40, 105,  -6,  -1, 1},
+  {-1,   4, -13,  36, 106,  -3,  -2, 1},
+  {-1,   4, -12,  32, 108,  -1,  -3, 1},
+  {-1,   3, -11,  28, 110,   1,  -4, 2},
+  {-1,   3, -10,  24, 111,   4,  -5, 2},
+  {-1,   3,  -9,  21, 111,   7,  -6, 2},
+  {-1,   3,  -9,  17, 112,  10,  -7, 3}
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -3, 128,   3,  -1,   0, 0},
+  {-1,   2,  -6, 127,   7,  -2,   1, 0},
+  {-1,   3,  -9, 126,  12,  -4,   1, 0},
+  {-1,   4, -12, 125,  16,  -5,   1, 0},
+  {-1,   4, -14, 123,  20,  -6,   2, 0},
+  {-1,   5, -15, 120,  25,  -8,   2, 0},
+  {-1,   5, -17, 118,  30,  -9,   3, -1},
+  {-1,   6, -18, 114,  35, -10,   3, -1},
+  {-1,   6, -19, 111,  41, -12,   3, -1},
+  {-1,   6, -20, 107,  46, -13,   4, -1},
+  {-1,   6, -21, 103,  52, -14,   4, -1},
+  {-1,   6, -21,  99,  57, -16,   5, -1},
+  {-1,   6, -21,  94,  63, -17,   5, -1},
+  {-1,   6, -20,  89,  68, -18,   5, -1},
+  {-1,   6, -20,  84,  73, -19,   6, -1},
+  {-1,   6, -20,  79,  79, -20,   6, -1},
+  {-1,   6, -19,  73,  84, -20,   6, -1},
+  {-1,   5, -18,  68,  89, -20,   6, -1},
+  {-1,   5, -17,  63,  94, -21,   6, -1},
+  {-1,   5, -16,  57,  99, -21,   6, -1},
+  {-1,   4, -14,  52, 103, -21,   6, -1},
+  {-1,   4, -13,  46, 107, -20,   6, -1},
+  {-1,   3, -12,  41, 111, -19,   6, -1},
+  {-1,   3, -10,  35, 114, -18,   6, -1},
+  {-1,   3,  -9,  30, 118, -17,   5, -1},
+  {0,   2,  -8,  25, 120, -15,   5, -1},
+  {0,   2,  -6,  20, 123, -14,   4, -1},
+  {0,   1,  -5,  16, 125, -12,   4, -1},
+  {0,   1,  -4,  12, 126,  -9,   3, -1},
+  {0,   1,  -2,   7, 127,  -6,   2, -1},
+  {0,   0,  -1,   3, 128,  -3,   1, 0}
+};
+
 // Filters for factor of 2 downsampling.
 static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1};
 static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3};
 
+// Selects the interpolation kernel bank for a resize from inlength samples to
+// outlength samples. The output/input ratio is tested in sixteenths, highest
+// band first: >= 16/16 -> full-band, >= 13/16 -> 0.875, >= 11/16 -> 0.75,
+// >= 9/16 -> 0.625, anything smaller -> 0.5.
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+  // Scale once so every threshold is an integer comparison.
+  const int scaled_out = outlength * 16;
+  return scaled_out >= inlength * 16 ? vp9_filteredinterp_filters1000 :
+         scaled_out >= inlength * 13 ? vp9_filteredinterp_filters875 :
+         scaled_out >= inlength * 11 ? vp9_filteredinterp_filters750 :
+         scaled_out >= inlength * 9 ? vp9_filteredinterp_filters625 :
+                                      vp9_filteredinterp_filters500;
+}
+
 static void interpolate(const uint8_t *const input, int inlength,
                         uint8_t *output, int outlength) {
   const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
@@ -81,6 +238,9 @@
   int x, x1, x2, sum, k, int_pel, sub_pel;
   int64_t y;
 
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
   x = 0;
   y = offset;
   while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
@@ -101,7 +261,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k) {
         const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
@@ -116,7 +276,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k)
         sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
@@ -129,7 +289,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k)
         sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
@@ -140,7 +300,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k)
         sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=

diff --git a/vp9/encoder/vp9_resize.h b/vp9/encoder/vp9_resize.h
index c67595a..1818cd4 100644
--- a/vp9/encoder/vp9_resize.h
+++ b/vp9/encoder/vp9_resize.h

@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_RESIZE_H_
 
 #include <stdio.h>
+#include "vpx/vpx_integer.h"
 
 void vp9_resize_plane(const uint8_t *const input,
                       int height,