Merge "vpxdec: restoring old md5 behavior for y4m files."
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index 495e9d7..dbd8c9e 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -21,6 +21,7 @@
   OUTPUT_FMT_PLAIN,
   OUTPUT_FMT_RVDS,
   OUTPUT_FMT_GAS,
+  OUTPUT_FMT_C_HEADER,
 } output_fmt_t;
 
 int log_msg(const char *fmt, ...) {
@@ -46,6 +47,9 @@
     case  OUTPUT_FMT_GAS:
       printf(".set %-40s, %5d\n", name, val);
       return 0;
+    case OUTPUT_FMT_C_HEADER:
+      printf("#define %-40s %5d\n", name, val);
+      return 0;
     default:
       log_msg("Unsupported mode: %d", mode);
       return 1;
@@ -491,6 +495,13 @@
                                               sym.st_name),
                        val);
                 break;
+              case OUTPUT_FMT_C_HEADER:
+                printf("#define %-40s %5d\n",
+                       parse_elf_string_table(&elf,
+                                              shdr.sh_link,
+                                              sym.st_name),
+                       val);
+                break;
               default:
                 printf("%s = %d\n",
                        parse_elf_string_table(&elf,
@@ -762,6 +773,7 @@
     fprintf(stderr, "Output Formats:\n");
     fprintf(stderr, "  gas  - compatible with GNU assembler\n");
     fprintf(stderr, "  rvds - compatible with armasm\n");
+    fprintf(stderr, "  cheader - c/c++ header file\n");
     goto bail;
   }
 
@@ -771,6 +783,8 @@
     mode = OUTPUT_FMT_RVDS;
   else if (!strcmp(argv[1], "gas"))
     mode = OUTPUT_FMT_GAS;
+  else if (!strcmp(argv[1], "cheader"))
+    mode = OUTPUT_FMT_C_HEADER;
   else
     f = argv[1];
 
diff --git a/examples.mk b/examples.mk
index 2337d1e..6940353 100644
--- a/examples.mk
+++ b/examples.mk
@@ -64,14 +64,20 @@
 vp9_spatial_scalable_encoder.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
 vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
 
+ifeq ($(CONFIG_SHARED),no)
+UTILS-$(CONFIG_VP9_ENCODER)    += resize_util.c
+endif
+
 # XMA example disabled for now, not used in VP8
 #UTILS-$(CONFIG_DECODERS)    += example_xma.c
 #example_xma.GUID             = A955FC4A-73F1-44F7-135E-30D84D32F022
 #example_xma.DESCRIPTION      = External Memory Allocation mode usage
 
 GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += simple_decoder.c
-simple_decoder.GUID              = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
-simple_decoder.DESCRIPTION       = Simplified decoder loop
+simple_decoder.GUID                = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
+simple_decoder.SRCS                += ivfdec.h ivfdec.c
+simple_decoder.SRCS                += tools_common.h tools_common.c
+simple_decoder.DESCRIPTION         = Simplified decoder loop
 GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += postproc.c
 postproc.GUID                    = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION             = Decoder postprocessor control
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index 67cbd6c..bba2182 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -29,7 +29,6 @@
 // is processed, then U, then V. It is important to honor the image's `stride`
 // values.
 
-#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -44,28 +43,6 @@
 #include "./tools_common.h"
 #include "./vpx_config.h"
 
-#define VP8_FOURCC 0x30385056
-#define VP9_FOURCC 0x30395056
-
-static vpx_codec_iface_t *get_codec_interface(unsigned int fourcc) {
-  switch (fourcc) {
-    case VP8_FOURCC:
-      return vpx_codec_vp8_dx();
-    case VP9_FOURCC:
-      return vpx_codec_vp9_dx();
-  }
-  return NULL;
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-  const char *detail = vpx_codec_error_detail(ctx);
-
-  printf("%s: %s\n", s, vpx_codec_error(ctx));
-  if(detail)
-    printf("    %s\n",detail);
-  exit(EXIT_FAILURE);
-}
-
 static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
   int plane, y;
   MD5Context md5;
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index 17a5987..23399f4 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -77,110 +77,82 @@
 // few exeptions, vpx_codec functions return an enumerated error status,
 // with the value `0` indicating success.
 
-#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
+
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
-#define interface (vpx_codec_vp8_dx())
 
+#include "./ivfdec.h"
+#include "./tools_common.h"
+#include "./vpx_config.h"
 
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
+static const char *exec_name;
 
-static unsigned int mem_get_le32(const unsigned char *mem) {
-    return (mem[3] << 24)|(mem[2] << 16)|(mem[1] << 8)|(mem[0]);
+void usage_exit() {
+  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
 }
 
-static void die(const char *fmt, ...) {
-    va_list ap;
-
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
-
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
-}
-
-
 int main(int argc, char **argv) {
-    FILE            *infile, *outfile;
-    vpx_codec_ctx_t  codec;
-    int              flags = 0, frame_cnt = 0;
-    unsigned char    file_hdr[IVF_FILE_HDR_SZ];
-    unsigned char    frame_hdr[IVF_FRAME_HDR_SZ];
-    unsigned char    frame[256*1024];
-    vpx_codec_err_t  res;
+  FILE *infile, *outfile;
+  vpx_codec_ctx_t codec;
+  vpx_codec_iface_t *iface;
+  int flags = 0, frame_cnt = 0;
+  vpx_video_t *video;
 
-    (void)res;
-    /* Open files */
-    if(argc!=3)
-        die("Usage: %s <infile> <outfile>\n", argv[0]);
-    if(!(infile = fopen(argv[1], "rb")))
-        die("Failed to open %s for reading", argv[1]);
-    if(!(outfile = fopen(argv[2], "wb")))
-        die("Failed to open %s for writing", argv[2]);
+  exec_name = argv[0];
 
-    /* Read file header */
-    if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
-         && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
-         && file_hdr[3]=='F'))
-        die("%s is not an IVF file.", argv[1]);
+  if (argc != 3)
+    die("Invalid number of arguments");
 
-    printf("Using %s\n",vpx_codec_iface_name(interface));
-    /* Initialize codec */
-    if(vpx_codec_dec_init(&codec, interface, NULL, flags))
-        die_codec(&codec, "Failed to initialize decoder");
+  if (!(infile = fopen(argv[1], "rb")))
+    die("Failed to open %s for reading", argv[1]);
 
-    /* Read each frame */
-    while(fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
-        int               frame_sz = mem_get_le32(frame_hdr);
-        vpx_codec_iter_t  iter = NULL;
-        vpx_image_t      *img;
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing", argv[2]);
 
+  video = vpx_video_open_file(infile);
+  if (!video)
+    die("%s is not an IVF file.", argv[1]);
 
-        frame_cnt++;
-        if(frame_sz > sizeof(frame))
-            die("Frame %d data too big for example code buffer", frame_sz);
-        if(fread(frame, 1, frame_sz, infile) != frame_sz)
-            die("Frame %d failed to read complete frame", frame_cnt);
+  iface = get_codec_interface(vpx_video_get_fourcc(video));
+  if (!iface)
+    die("Unknown FOURCC code.");
 
-        /* Decode the frame */
-        if(vpx_codec_decode(&codec, frame, frame_sz, NULL, 0))
-            die_codec(&codec, "Failed to decode frame");
+  printf("Using %s\n", vpx_codec_iface_name(iface));
 
-        /* Write decoded data to disk */
-        while((img = vpx_codec_get_frame(&codec, &iter))) {
-            unsigned int plane, y;
+  if (vpx_codec_dec_init(&codec, iface, NULL, flags))
+    die_codec(&codec, "Failed to initialize decoder");
 
-            for(plane=0; plane < 3; plane++) {
-                unsigned char *buf =img->planes[plane];
+  while (vpx_video_read_frame(video)) {
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img = NULL;
+    size_t frame_size = 0;
+    const unsigned char *frame = vpx_video_get_frame(video, &frame_size);
+    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+      die_codec(&codec, "Failed to decode frame");
 
-                for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-                    (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
-                                  outfile);
-                    buf += img->stride[plane];
-                }
-            }
-        }
+    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
+      vpx_img_write(img, outfile);
+      ++frame_cnt;
     }
-    printf("Processed %d frames.\n",frame_cnt);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
+  }
 
-    fclose(outfile);
-    fclose(infile);
-    return EXIT_SUCCESS;
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec");
+
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         vpx_video_get_width(video), vpx_video_get_height(video), argv[2]);
+
+  vpx_video_close(video);
+
+  fclose(outfile);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
 }
diff --git a/resize_util.c b/resize_util.c
new file mode 100644
index 0000000..b068f55
--- /dev/null
+++ b/resize_util.c
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vp9/encoder/vp9_resize.h"
+
+static void usage(char *progname) {
+  printf("Usage:\n");
+  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
+         progname);
+  printf("<output_yuv> [<frames>]\n");
+}
+
+static int parse_dim(char *v, int *width, int *height) {
+  char *x = strchr(v, 'x');
+  if (x == NULL)
+    x = strchr(v, 'X');
+  if (x == NULL)
+    return 0;
+  *width = atoi(v);
+  *height = atoi(&x[1]);
+  if (*width <= 0 || *height <= 0)
+    return 0;
+  else
+    return 1;
+}
+
+int main(int argc, char *argv[]) {
+  char *fin, *fout;
+  FILE *fpin, *fpout;
+  uint8_t *inbuf, *outbuf;
+  uint8_t *inbuf_u, *outbuf_u;
+  uint8_t *inbuf_v, *outbuf_v;
+  int f, frames;
+  int width, height, target_width, target_height;
+
+  if (argc < 5) {
+    printf("Incorrect parameters:\n");
+    usage(argv[0]);
+    return 1;
+  }
+
+  fin = argv[1];
+  fout = argv[4];
+  if (!parse_dim(argv[2], &width, &height)) {
+    printf("Incorrect parameters: %s\n", argv[2]);
+    usage(argv[0]);
+    return 1;
+  }
+  if (!parse_dim(argv[3], &target_width, &target_height)) {
+    printf("Incorrect parameters: %s\n", argv[3]);
+    usage(argv[0]);
+    return 1;
+  }
+
+  fpin = fopen(fin, "rb");
+  if (fpin == NULL) {
+    printf("Can't open file %s to read\n", fin);
+    usage(argv[0]);
+    return 1;
+  }
+  fpout = fopen(fout, "wb");
+  if (fpout == NULL) {
+    printf("Can't open file %s to write\n", fout);
+    usage(argv[0]);
+    return 1;
+  }
+  if (argc >= 6)
+    frames = atoi(argv[5]);
+  else
+    frames = INT_MAX;
+
+  printf("Input size:  %dx%d\n",
+         width, height);
+  printf("Target size: %dx%d, Frames: ",
+         target_width, target_height);
+  if (frames == INT_MAX)
+    printf("All\n");
+  else
+    printf("%d\n", frames);
+
+  inbuf = (uint8_t*)malloc(width * height * 3 / 2);
+  outbuf = (uint8_t*)malloc(target_width * target_height * 3 / 2);
+  inbuf_u = inbuf + width * height;
+  inbuf_v = inbuf_u + width * height / 4;
+  outbuf_u = outbuf + target_width * target_height;
+  outbuf_v = outbuf_u + target_width * target_height / 4;
+  f = 0;
+  while (f < frames) {
+    if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1)
+      break;
+    vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2,
+                        height, width,
+                        outbuf, target_width, outbuf_u, outbuf_v,
+                        target_width / 2,
+                        target_height, target_width);
+    fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
+    f++;
+  }
+  printf("%d frames processed\n", f);
+  fclose(fpin);
+  fclose(fpout);
+
+  free(inbuf);
+  free(outbuf);
+  return 0;
+}
diff --git a/tools_common.c b/tools_common.c
index 9c24983..85bedc9 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -15,6 +15,10 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
+
 #if defined(_WIN32) || defined(__OS2__)
 #include <io.h>
 #include <fcntl.h>
@@ -60,6 +64,15 @@
   LOG_ERROR("Warning");
 }
 
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
+  const char *detail = vpx_codec_error_detail(ctx);
+
+  printf("%s: %s\n", s, vpx_codec_error(ctx));
+  if (detail)
+    printf("    %s\n", detail);
+  exit(EXIT_FAILURE);
+}
+
 uint16_t mem_get_le16(const void *data) {
   uint16_t val;
   const uint8_t *mem = (const uint8_t*)data;
@@ -130,3 +143,34 @@
 
   return shortread;
 }
+
+vpx_codec_iface_t *get_codec_interface(unsigned int fourcc) {
+  switch (fourcc) {
+#if CONFIG_VP8_DECODER
+    case VP8_FOURCC:
+      return vpx_codec_vp8_dx();
+#endif
+#if CONFIG_VP9_DECODER
+    case VP9_FOURCC:
+      return vpx_codec_vp9_dx();
+#endif
+    default:
+      return NULL;
+  }
+  return NULL;
+}
+
+void vpx_img_write(const vpx_image_t *img, FILE *file) {
+  int plane, y;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+    const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, 1, w, file);
+      buf += stride;
+    }
+  }
+}
diff --git a/tools_common.h b/tools_common.h
index 1d70ab5..967b7a1 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -13,6 +13,7 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
 
@@ -112,6 +113,8 @@
 void fatal(const char *fmt, ...);
 void warn(const char *fmt, ...);
 
+void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+
 /* The tool including this file must define usage_exit() */
 void usage_exit();
 
@@ -120,6 +123,12 @@
 
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
+vpx_codec_iface_t *get_codec_interface(unsigned int fourcc);
+
+// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
+// of vpx_image_t support
+void vpx_img_write(const vpx_image_t *img, FILE *file);
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 65d087a..279f678 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -19,6 +19,7 @@
     EXPORT  |vp9_tm_predictor_4x4_neon|
     EXPORT  |vp9_tm_predictor_8x8_neon|
     EXPORT  |vp9_tm_predictor_16x16_neon|
+    EXPORT  |vp9_tm_predictor_32x32_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -347,29 +348,32 @@
     ldrb                r12, [r12]
     vdup.u8             d0, r12
 
+    ; preload 8 left
+    vld1.8              d30, [r3]
+
     ; Load above 8 pixels
     vld1.64             {d2}, [r2]
 
+    vmovl.u8            q10, d30
+
     ; Compute above - ytop_left
     vsubl.u8            q3, d2, d0
 
     ; Load left row by row and compute left + (above - ytop_left)
-    vld1.u8             {d6}, [r3]
-
     ; 1st row and 2nd row
-    vdup.u8             d0, d6[0]
-    vdup.u8             d1, d6[1]
-    vaddw.s16           q1, q3, d0
-    vaddw.s16           q2, q3, d1
+    vdup.16             q0, d20[0]
+    vdup.16             q1, d20[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
 
     ; 3rd row and 4th row
-    vdup.u8             d0, d6[2]
-    vdup.u8             d1, d6[3]
-    vaddw.s16           q8, q3, d0
-    vaddw.s16           q9, q3, d1
+    vdup.16             q8, d20[2]
+    vdup.16             q9, d20[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqshrun.s16         d0, q0, #0
+    vqshrun.s16         d1, q1, #0
     vqshrun.s16         d2, q8, #0
     vqshrun.s16         d3, q9, #0
 
@@ -379,19 +383,19 @@
     vst1.64             {d3}, [r0], r1
 
     ; 5th row and 6th row
-    vdup.u8             d0, d6[4]
-    vdup.u8             d1, d6[5]
-    vaddw.s16           q1, q3, d0
-    vaddw.s16           q2, q3, d1
+    vdup.16             q0, d21[0]
+    vdup.16             q1, d21[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
 
-    ; 7rd row and 8th row
-    vdup.u8             d0, d6[6]
-    vdup.u8             d1, d6[7]
-    vaddw.s16           q8, q3, d0
-    vaddw.s16           q9, q3, d1
+    ; 7th row and 8th row
+    vdup.16             q8, d21[2]
+    vdup.16             q9, d21[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqshrun.s16         d0, q0, #0
+    vqshrun.s16         d1, q1, #0
     vqshrun.s16         d2, q8, #0
     vqshrun.s16         d3, q9, #0
 
@@ -499,4 +503,134 @@
     bx                  lr
     ENDP                ; |vp9_tm_predictor_16x16_neon|
 
+;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                  const uint8_t *above,
+;                                  const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vp9_tm_predictor_32x32_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             q0, r12
+
+    ; Load above 32 pixels
+    vld1.8              q1, [r2]!
+    vld1.8              q2, [r2]
+
+    ; preload 8 left pixels
+    vld1.8              d26, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q8, d2, d0
+    vsubl.u8            q9, d3, d1
+    vsubl.u8            q10, d4, d0
+    vsubl.u8            q11, d5, d1
+
+    vmovl.u8            q3, d26
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+    mov                 r2, #4
+
+loop_32x32_neon
+    ; Process two rows.
+    vdup.16             q0, d6[0]
+    vdup.16             q2, d6[1]
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vdup.16             q1, d6[2]
+    vdup.16             q2, d6[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q1, q8
+    vadd.s16            q13, q1, q9
+    vadd.s16            q14, q1, q10
+    vadd.s16            q15, q1, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vdup.16             q0, d7[0]
+    vdup.16             q2, d7[1]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vdup.16             q0, d7[2]
+    vdup.16             q2, d7[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqshrun.s16         d0, q12, #0
+    vqshrun.s16         d1, q13, #0
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqshrun.s16         d2, q14, #0
+    vqshrun.s16         d3, q15, #0
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqshrun.s16         d24, q12, #0
+    vqshrun.s16         d25, q13, #0
+    vld1.8              d0, [r3]!                   ; preload 8 left pixels
+    vqshrun.s16         d26, q14, #0
+    vqshrun.s16         d27, q15, #0
+    vmovl.u8            q3, d0
+    vst1.64             {d24-d27}, [r0], r1
+
+    subs                r2, r2, #1
+    bgt                 loop_32x32_neon
+
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_32x32_neon|
+
     END
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 39ea492..9e16d8f 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -252,7 +252,7 @@
   /* Inverse transform function pointers. */
   void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 
-  struct subpix_fn_table  subpix;
+  const interp_kernel *interp_kernel;
 
   int corrupted;
 
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index c8a0b61..b611e30 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -35,11 +35,6 @@
 
 typedef int16_t interp_kernel[SUBPEL_TAPS];
 
-struct subpix_fn_table {
-  const interp_kernel *filter_x;
-  const interp_kernel *filter_y;
-};
-
 const interp_kernel *vp9_get_interp_kernel(INTERP_FILTER filter);
 
 extern const interp_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 533f7f3..20b78bf 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -96,7 +96,7 @@
   }
 }
 
-static void idct4_1d(const int16_t *input, int16_t *output) {
+static void idct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -124,7 +124,7 @@
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    idct4_1d(input, outptr);
+    idct4(input, outptr);
     input += 4;
     outptr += 4;
   }
@@ -133,7 +133,7 @@
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    idct4_1d(temp_in, temp_out);
+    idct4(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                   + dest[j * stride + i]);
@@ -156,7 +156,7 @@
   }
 }
 
-static void idct8_1d(const int16_t *input, int16_t *output) {
+static void idct8(const int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -174,7 +174,7 @@
   step1[6] = dct_const_round_shift(temp2);
 
   // stage 2 & stage 3 - even half
-  idct4_1d(step1, step1);
+  idct4(step1, step1);
 
   // stage 2 - odd half
   step2[4] = step1[4] + step1[5];
@@ -209,7 +209,7 @@
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
-    idct8_1d(input, outptr);
+    idct8(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -218,7 +218,7 @@
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
+    idct8(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                   + dest[j * stride + i]);
@@ -238,7 +238,7 @@
   }
 }
 
-static void iadst4_1d(const int16_t *input, int16_t *output) {
+static void iadst4(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[0];
@@ -283,10 +283,10 @@
 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d IHT_4[] = {
-    { idct4_1d, idct4_1d  },  // DCT_DCT  = 0
-    { iadst4_1d, idct4_1d  },   // ADST_DCT = 1
-    { idct4_1d, iadst4_1d },    // DCT_ADST = 2
-    { iadst4_1d, iadst4_1d }      // ADST_ADST = 3
+    { idct4, idct4  },  // DCT_DCT  = 0
+    { iadst4, idct4  },   // ADST_DCT = 1
+    { idct4, iadst4 },    // DCT_ADST = 2
+    { iadst4, iadst4 }      // ADST_ADST = 3
   };
 
   int i, j;
@@ -311,7 +311,7 @@
                                   + dest[j * stride + i]);
   }
 }
-static void iadst8_1d(const int16_t *input, int16_t *output) {
+static void iadst8(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -389,10 +389,10 @@
 }
 
 static const transform_2d IHT_8[] = {
-  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
-  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
-  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
-  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
+  { idct8,  idct8  },  // DCT_DCT  = 0
+  { iadst8, idct8  },  // ADST_DCT = 1
+  { idct8,  iadst8 },  // DCT_ADST = 2
+  { iadst8, iadst8 }   // ADST_ADST = 3
 };
 
 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -430,7 +430,7 @@
   // First transform rows
   // only first 4 row has non-zero coefs
   for (i = 0; i < 4; ++i) {
-    idct8_1d(input, outptr);
+    idct8(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -439,14 +439,14 @@
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
+    idct8(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                   + dest[j * stride + i]);
   }
 }
 
-static void idct16_1d(const int16_t *input, int16_t *output) {
+static void idct16(const int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
 
@@ -619,7 +619,7 @@
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
-    idct16_1d(input, outptr);
+    idct16(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -628,14 +628,14 @@
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
-    idct16_1d(temp_in, temp_out);
+    idct16(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                   + dest[j * stride + i]);
   }
 }
 
-static void iadst16_1d(const int16_t *input, int16_t *output) {
+static void iadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -807,10 +807,10 @@
 }
 
 static const transform_2d IHT_16[] = {
-  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
-  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
-  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
-  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
+  { idct16,  idct16  },  // DCT_DCT  = 0
+  { iadst16, idct16  },  // ADST_DCT = 1
+  { idct16,  iadst16 },  // DCT_ADST = 2
+  { iadst16, iadst16 }   // ADST_ADST = 3
 };
 
 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -848,7 +848,7 @@
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   for (i = 0; i < 4; ++i) {
-    idct16_1d(input, outptr);
+    idct16(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -857,7 +857,7 @@
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j*16 + i];
-    idct16_1d(temp_in, temp_out);
+    idct16(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                   + dest[j * stride + i]);
@@ -877,7 +877,7 @@
   }
 }
 
-static void idct32_1d(const int16_t *input, int16_t *output) {
+static void idct32(const int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
 
@@ -1263,7 +1263,7 @@
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      idct32_1d(input, outptr);
+      idct32(input, outptr);
     else
       vpx_memset(outptr, 0, sizeof(int16_t) * 32);
     input += 32;
@@ -1274,7 +1274,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
+    idct32(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                         + dest[j * stride + i]);
@@ -1290,7 +1290,7 @@
   // Rows
   // only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
-    idct32_1d(input, outptr);
+    idct32(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1299,7 +1299,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
+    idct32(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                   + dest[j * stride + i]);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index b5a9248..d554cc0 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -69,13 +69,11 @@
                             const int subpel_y,
                             const struct scale_factors *sf,
                             int w, int h, int ref,
-                            const struct subpix_fn_table *subpix,
+                            const interp_kernel *kernel,
                             int xs, int ys) {
   sf->predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
-      subpix->filter_x[subpel_x], xs,
-      subpix->filter_y[subpel_y], ys,
-      w, h);
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
 }
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -83,7 +81,7 @@
                                const MV *src_mv,
                                const struct scale_factors *sf,
                                int w, int h, int ref,
-                               const struct subpix_fn_table *subpix,
+                               const interp_kernel *kernel,
                                enum mv_precision precision,
                                int x, int y) {
   const int is_q4 = precision == MV_PRECISION_Q4;
@@ -96,7 +94,7 @@
   src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
 
   inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                  sf, w, h, ref, subpix, sf->x_step_q4, sf->y_step_q4);
+                  sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
 }
 
 static INLINE int round_mv_comp_q4(int value) {
@@ -198,7 +196,8 @@
            + (scaled_mv.col >> SUBPEL_BITS);
 
     inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                    subpel_x, subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
+                    subpel_x, subpel_y, sf, w, h, ref, xd->interp_kernel,
+                    xs, ys);
   }
 }
 
@@ -367,7 +366,7 @@
     }
 
     inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                    subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
+                    subpel_y, sf, w, h, ref, xd->interp_kernel, xs, ys);
   }
 }
 
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 58f4b41..3345d83 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -18,7 +18,6 @@
 extern "C" {
 #endif
 
-struct subpix_fn_table;
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize);
 
@@ -36,7 +35,7 @@
                                const MV *mv_q3,
                                const struct scale_factors *sf,
                                int w, int h, int do_avg,
-                               const struct subpix_fn_table *subpix,
+                               const interp_kernel *kernel,
                                enum mv_precision precision,
                                int x, int y);
 
@@ -94,11 +93,6 @@
   }
 }
 
-static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, int ref0, int ref1) {
-  xd->block_refs[0] = &cm->frame_refs[ref0 >= 0 ? ref0 : 0];
-  xd->block_refs[1] = &cm->frame_refs[ref1 >= 0 ? ref1 : 0];
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8fd6db8..66defb9 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -174,7 +174,7 @@
 specialize vp9_v_predictor_32x32 $sse2_x86inc neon
 
 prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_32x32 $sse2_x86_64
+specialize vp9_tm_predictor_32x32 $sse2_x86_64 neon
 
 prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_32x32 $sse2_x86inc
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 2f61494..13a5b5a 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -180,7 +180,7 @@
   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
 }
 
-static void idct4_1d_sse2(__m128i *in) {
+static void idct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -216,7 +216,7 @@
   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
-static void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -276,20 +276,20 @@
 
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct4_1d_sse2(in);
-      idct4_1d_sse2(in);
+      idct4_sse2(in);
+      idct4_sse2(in);
       break;
     case 1:  // ADST_DCT
-      idct4_1d_sse2(in);
-      iadst4_1d_sse2(in);
+      idct4_sse2(in);
+      iadst4_sse2(in);
       break;
     case 2:  // DCT_ADST
-      iadst4_1d_sse2(in);
-      idct4_1d_sse2(in);
+      iadst4_sse2(in);
+      idct4_sse2(in);
       break;
     case 3:  // ADST_ADST
-      iadst4_1d_sse2(in);
-      iadst4_1d_sse2(in);
+      iadst4_sse2(in);
+      iadst4_sse2(in);
       break;
     default:
       assert(0);
@@ -455,7 +455,7 @@
       res1 = _mm_packs_epi32(tmp2, tmp3); \
   }
 
-#define IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3, out4, out5, out6, out7)  \
   { \
   /* Stage1 */      \
@@ -573,7 +573,7 @@
                   in0, in1, in2, in3, in4, in5, in6, in7);
 
     // 4-stage 1D idct8x8
-    IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
              in0, in1, in2, in3, in4, in5, in6, in7);
   }
 
@@ -674,7 +674,7 @@
   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
 }
 
-static void idct8_1d_sse2(__m128i *in) {
+static void idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -695,11 +695,11 @@
                 in0, in1, in2, in3, in4, in5, in6, in7);
 
   // 4-stage 1D idct8x8
-  IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
            in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
 }
 
-static void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -946,20 +946,20 @@
 
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct8_1d_sse2(in);
-      idct8_1d_sse2(in);
+      idct8_sse2(in);
+      idct8_sse2(in);
       break;
     case 1:  // ADST_DCT
-      idct8_1d_sse2(in);
-      iadst8_1d_sse2(in);
+      idct8_sse2(in);
+      iadst8_sse2(in);
       break;
     case 2:  // DCT_ADST
-      iadst8_1d_sse2(in);
-      idct8_1d_sse2(in);
+      iadst8_sse2(in);
+      idct8_sse2(in);
       break;
     case 3:  // ADST_ADST
-      iadst8_1d_sse2(in);
-      iadst8_1d_sse2(in);
+      iadst8_sse2(in);
+      iadst8_sse2(in);
       break;
     default:
       assert(0);
@@ -1104,7 +1104,7 @@
 
   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
 
-  IDCT8_1D(in0, in1, in2, in3, zero, zero, zero, zero,
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
            in0, in1, in2, in3, in4, in5, in6, in7);
   // Final rounding and shift
   in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1135,7 +1135,7 @@
   RECON_AND_STORE(dest, in7);
 }
 
-#define IDCT16_1D \
+#define IDCT16 \
   /* Stage2 */ \
   { \
     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
@@ -1264,7 +1264,7 @@
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-#define IDCT16_10_1D \
+#define IDCT16_10 \
     /* Stage2 */ \
     { \
       const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
@@ -1437,7 +1437,7 @@
       array_transpose_8x8(in, in);
       array_transpose_8x8(in+8, in+8);
 
-      IDCT16_1D
+      IDCT16
 
       // Stage7
       curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1465,7 +1465,7 @@
       array_transpose_8x8(l+i*8, in);
       array_transpose_8x8(r+i*8, in+8);
 
-      IDCT16_1D
+      IDCT16
 
       // 2-D
       in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1590,7 +1590,7 @@
   res0[15] = tbuf[7];
 }
 
-static void iadst16_1d_8col(__m128i *in) {
+static void iadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2060,7 +2060,7 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-static void idct16_1d_8col(__m128i *in) {
+static void idct16_8col(__m128i *in) {
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2404,16 +2404,16 @@
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  idct16_1d_8col(in0);
-  idct16_1d_8col(in1);
+  idct16_8col(in0);
+  idct16_8col(in1);
 }
 
-static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  iadst16_1d_8col(in0);
-  iadst16_1d_8col(in1);
+  iadst16_8col(in0);
+  iadst16_8col(in1);
 }
 
 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
@@ -2502,20 +2502,20 @@
 
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct16_1d_sse2(in0, in1);
-      idct16_1d_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      idct16_1d_sse2(in0, in1);
-      iadst16_1d_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      iadst16_1d_sse2(in0, in1);
-      idct16_1d_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      iadst16_1d_sse2(in0, in1);
-      iadst16_1d_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
     default:
       assert(0);
@@ -2732,7 +2732,7 @@
   for (i = 0; i < 2; i++) {
     array_transpose_4X8(l + 8*i, in);
 
-    IDCT16_10_1D
+    IDCT16_10
 
     // Stage7
     in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -2814,7 +2814,7 @@
     input += 8; \
   }  \
 
-#define IDCT32_1D_34 \
+#define IDCT32_34 \
 /* Stage1 */ \
 { \
   const __m128i zero = _mm_setzero_si128();\
@@ -3115,7 +3115,7 @@
 }
 
 
-#define IDCT32_1D \
+#define IDCT32 \
 /* Stage1 */ \
 { \
   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
@@ -3554,7 +3554,7 @@
   array_transpose_8x8(in+16, in+16);
   array_transpose_8x8(in+24, in+24);
 
-  IDCT32_1D
+  IDCT32
 
   // 1_D: Store 32 intermediate results for each 8x32 block.
   col[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3593,7 +3593,7 @@
       const __m128i zero = _mm_setzero_si128();
       // Transpose 32x8 block to 8x32 block
       array_transpose_8x8(col+i*8, in);
-      IDCT32_1D_34
+      IDCT32_34
 
       // 2_D: Calculate the results and store them to destination.
       in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3922,7 +3922,7 @@
       array_transpose_8x8(in+16, in+16);
       array_transpose_8x8(in+24, in+24);
 
-      IDCT32_1D
+      IDCT32
 
       // 1_D: Store 32 intermediate results for each 8x32 block.
       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3969,7 +3969,7 @@
       array_transpose_8x8(col+j+64, in+16);
       array_transpose_8x8(col+j+96, in+24);
 
-      IDCT32_1D
+      IDCT32
 
       // 2_D: Calculate the results and store them to destination.
       in[0] = _mm_add_epi16(stp1_0, stp1_31);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 466b180..1d9be53 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -421,8 +421,7 @@
     if (has_second_ref(mbmi))
       set_ref(cm, xd, 1, mi_row, mi_col);
 
-    xd->subpix.filter_x = xd->subpix.filter_y =
-        vp9_get_interp_kernel(mbmi->interp_filter);
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     // Prediction
     vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 0f4a6bb..a840b48 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -997,7 +997,7 @@
   return rv;
 }
 
-static void dct32_1d(const int *input, int *output, int round) {
+static void fdct32(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
@@ -1329,7 +1329,7 @@
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
@@ -1339,13 +1339,13 @@
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 }
 
-// Note that although we use dct_32_round in dct32_1d computation flow,
+// Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
@@ -1357,7 +1357,7 @@
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1370,7 +1370,7 @@
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 1);
+    fdct32(temp_in, temp_out, 1);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = temp_out[j];
   }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 317ac98..e5d4583 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2681,6 +2681,7 @@
       vp9_update_zbin_extra(cpi, x);
     }
   } else {
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
     vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index d148530..3b641a1 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -27,20 +27,9 @@
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERP_FILTER filter,
                               VP9_COMMON *cm) {
-  if (xd->mi_8x8 && xd->mi_8x8[0]) {
-    MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - LAST_FRAME,
-                 mbmi->ref_frame[1] - LAST_FRAME);
-
-  } else {
-    set_ref_ptrs(cm, xd, -1, -1);
-  }
-
-  xd->subpix.filter_x = xd->subpix.filter_y =
-      vp9_get_interp_kernel(filter == SWITCHABLE ? EIGHTTAP : filter);
-
-  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
+  xd->interp_kernel = vp9_get_interp_kernel(filter == SWITCHABLE ? EIGHTTAP
+                                                                 : filter);
+  assert(((intptr_t)xd->interp_kernel & 0xff) == 0);
 }
 
 void vp9_subtract_block_c(int rows, int cols,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 395ce20..28b343c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -447,6 +447,16 @@
   }
 }
 
+static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
+  if (2 * mb_col + 1 < cm->mi_cols) {
+    return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_16X16
+                                        : BLOCK_16X8;
+  } else {
+    return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_8X16
+                                        : BLOCK_8X8;
+  }
+}
+
 void vp9_first_pass(VP9_COMP *cpi) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
@@ -542,6 +552,7 @@
       int this_error;
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
+      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
 
       vp9_clear_system_state();  // __asm emms;
 
@@ -549,30 +560,15 @@
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
-
-      if (mb_col * 2 + 1 < cm->mi_cols) {
-        if (mb_row * 2 + 1 < cm->mi_rows) {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16;
-        } else {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8;
-        }
-      } else {
-        if (mb_row * 2 + 1 < cm->mi_rows) {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16;
-        } else {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8;
-        }
-      }
+      xd->mi_8x8[0]->mbmi.sb_type = bsize;
       xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
       set_mi_row_col(xd, &tile,
-                     mb_row << 1,
-                     num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
-                     mb_col << 1,
-                     num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
+                     mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
                      cm->mi_rows, cm->mi_cols);
 
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
+        int energy = vp9_block_energy(cpi, x, bsize);
         error_weight = vp9_vaq_inv_q_ratio(energy);
       }
 
@@ -692,9 +688,8 @@
           xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
           xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
-          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1,
-                                         xd->mi_8x8[0]->mbmi.sb_type);
-          vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
+          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+          vp9_encode_sby(x, bsize);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 1852d1e..8f06af1 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -839,6 +839,7 @@
   if (speed >= 5) {
     int i;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
       sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
@@ -867,6 +868,7 @@
   sf->recode_loop = 1;
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
+  sf->subpel_force_stop = 0;
   sf->optimize_coefficients = !cpi->oxcf.lossless;
   sf->reduce_first_step_size = 0;
   sf->auto_mv_step_size = 0;
@@ -3395,6 +3397,7 @@
                             int64_t *time_stamp, int64_t *time_end, int flush) {
   VP9_COMP *cpi = (VP9_COMP *) ptr;
   VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   MV_REFERENCE_FRAME ref_frame;
@@ -3578,7 +3581,8 @@
       vp9_extend_frame_borders(buf, cm->subsampling_x, cm->subsampling_y);
   }
 
-  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+  vp9_setup_interp_filters(xd, DEFAULT_INTERP_FILTER, cm);
 
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
       vp9_vaq_init();
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9cf3f62..7a12f12 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -256,6 +256,9 @@
   // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
 
+  // Control when to stop subpel search
+  int subpel_force_stop;
+
   // Thresh_mult is used to set a threshold for the rd score. A higher value
   // means that we will accept the best mode so far more often. This number
   // is used in combination with the current block size, and thresh_freq_fact
@@ -821,6 +824,12 @@
   return mb_rows * mb_cols * (48 * 16 + 4);
 }
 
+static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                         MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) {
+  xd->block_refs[0] = &cm->frame_refs[ref0 - LAST_FRAME];
+  xd->block_refs[1] = &cm->frame_refs[ref1 - LAST_FRAME];
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 9d73df2..0c0a20f 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -34,6 +34,22 @@
 void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
 }
 
+static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
+                            MACROBLOCKD *const xd, VP9_COMMON *const cm,
+                            int filt_level, int partial) {
+  int filt_err;
+
+  vp9_set_alt_lf_level(cpi, filt_level);
+  vp9_loop_filter_frame(cm, xd, filt_level, 1, partial);
+
+  filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+  // Re-instate the unfiltered frame
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  return filt_err;
+}
+
 static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
                                 int partial) {
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -41,31 +57,30 @@
   struct loopfilter *const lf = &cm->lf;
   const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
   const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-  int best_err = 0;
-  int filt_err = 0;
+  int best_err;
   int filt_best;
   int filt_direction = 0;
   // Start the search at the previous frame filter level unless it is now out of
   // range.
   int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Sum squared error at each filter level
+  int ss_err[MAX_LOOP_FILTER + 1];
+
+  // Set each entry to -1
+  vpx_memset(ss_err, 0xFF, sizeof(ss_err));
 
   //  Make a copy of the unfiltered / processed recon buffer
   vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
 
-  // Get baseline error score
-  vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame(cm, xd, filt_mid, 1, partial);
-
-  best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+  best_err = try_filter_frame(sd, cpi, xd, cm, filt_mid, partial);
   filt_best = filt_mid;
-
-  //  Re-instate the unfiltered frame
-  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+  ss_err[filt_mid] = best_err;
 
   while (filter_step > 0) {
     const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
     const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
+    int filt_err;
 
     // Bias against raising loop filter in favor of lowering it.
     int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
@@ -79,14 +94,12 @@
 
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
-      vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame(cm, xd, filt_low, 1, partial);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      // Re-instate the unfiltered frame
-      vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
+      if (ss_err[filt_low] < 0) {
+        filt_err = try_filter_frame(sd, cpi, xd, cm, filt_low, partial);
+        ss_err[filt_low] = filt_err;
+      } else {
+        filt_err = ss_err[filt_low];
+      }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
       if ((filt_err - bias) < best_err) {
@@ -100,14 +113,12 @@
 
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
-      vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame(cm, xd, filt_high, 1, partial);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-      vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
+      if (ss_err[filt_high] < 0) {
+        filt_err = try_filter_frame(sd, cpi, xd, cm, filt_high, partial);
+        ss_err[filt_high] = filt_err;
+      } else {
+        filt_err = ss_err[filt_high];
+      }
       // Was it better than the previous best?
       if (filt_err < (best_err - bias)) {
         best_err = filt_err;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3be79f4..da4cd3a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1517,8 +1517,8 @@
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
-                              &xd->block_refs[ref]->sf,
-                              width, height, ref, &xd->subpix, MV_PRECISION_Q3,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              xd->interp_kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE + 4 * (i % 2),
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
@@ -1840,7 +1840,8 @@
                                          &bsi->ref_mv->as_mv,
                                          cm->allow_high_precision_mv,
                                          x->errorperbit, v_fn_ptr,
-                                         0, cpi->sf.subpel_iters_per_step,
+                                         cpi->sf.subpel_force_stop,
+                                         cpi->sf.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
                                          &distortion,
                                          &x->pred_sse[mbmi->ref_frame[0]]);
@@ -2451,7 +2452,8 @@
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
                                  &cpi->fn_ptr[bsize],
-                                 0, cpi->sf.subpel_iters_per_step,
+                                 cpi->sf.subpel_force_stop,
+                                 cpi->sf.subpel_iters_per_step,
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref]);
   }
@@ -2536,7 +2538,7 @@
                               &frame_mv[refs[!id]].as_mv,
                               &xd->block_refs[!id]->sf,
                               pw, ph, 0,
-                              &xd->subpix, MV_PRECISION_Q3,
+                              xd->interp_kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE, mi_row * MI_SIZE);
 
     // Compound motion search on first ref frame.
@@ -3277,7 +3279,7 @@
           continue;
     }
 
-    set_ref_ptrs(cm, xd, ref_frame - 1, second_ref_frame - 1);
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
     mbmi->uv_mode = DC_PRED;
 
     // Evaluate all sub-pel filters irrespective of whether we can use
@@ -3709,7 +3711,7 @@
     vp9_zero(best_tx_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
@@ -3902,7 +3904,7 @@
         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
       continue;
 
-    set_ref_ptrs(cm, xd, ref_frame - 1, second_ref_frame - 1);
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
     mbmi->uv_mode = DC_PRED;
 
     // Evaluate all sub-pel filters irrespective of whether we can use
@@ -4442,7 +4444,7 @@
     vp9_zero(best_tx_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index f15abc0..0766b51 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -16,7 +16,6 @@
 #include <string.h>
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_resize.h"
-#include "vpx/vpx_integer.h"
 
 #define FILTER_BITS               7
 
@@ -30,8 +29,44 @@
 
 typedef int16_t interp_kernel[INTERP_TAPS];
 
-// Filters for interpolation - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters[(1 << SUBPEL_BITS)] = {
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+  {-3,  0, 35, 64, 35,  0, -3, 0},
+  {-3, -1, 34, 64, 36,  1, -3, 0},
+  {-3, -1, 32, 64, 38,  1, -3, 0},
+  {-2, -2, 31, 63, 39,  2, -3, 0},
+  {-2, -2, 29, 63, 41,  2, -3, 0},
+  {-2, -2, 28, 63, 42,  3, -4, 0},
+  {-2, -3, 27, 63, 43,  4, -4, 0},
+  {-2, -3, 25, 62, 45,  5, -4, 0},
+  {-2, -3, 24, 62, 46,  5, -4, 0},
+  {-2, -3, 23, 61, 47,  6, -4, 0},
+  {-2, -3, 21, 60, 49,  7, -4, 0},
+  {-1, -4, 20, 60, 50,  8, -4, -1},
+  {-1, -4, 19, 59, 51,  9, -4, -1},
+  {-1, -4, 17, 58, 52, 10, -4, 0},
+  {-1, -4, 16, 57, 53, 12, -4, -1},
+  {-1, -4, 15, 56, 54, 13, -4, -1},
+  {-1, -4, 14, 55, 55, 14, -4, -1},
+  {-1, -4, 13, 54, 56, 15, -4, -1},
+  {-1, -4, 12, 53, 57, 16, -4, -1},
+  {0, -4, 10, 52, 58, 17, -4, -1},
+  {-1, -4,  9, 51, 59, 19, -4, -1},
+  {-1, -4,  8, 50, 60, 20, -4, -1},
+  {0, -4,  7, 49, 60, 21, -3, -2},
+  {0, -4,  6, 47, 61, 23, -3, -2},
+  {0, -4,  5, 46, 62, 24, -3, -2},
+  {0, -4,  5, 45, 62, 25, -3, -2},
+  {0, -4,  4, 43, 63, 27, -3, -2},
+  {0, -4,  3, 42, 63, 28, -2, -2},
+  {0, -3,  2, 41, 63, 29, -2, -2},
+  {0, -3,  2, 39, 63, 31, -2, -2},
+  {0, -3,  1, 38, 64, 32, -1, -3},
+  {0, -3,  1, 36, 64, 34, -1, -3}
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
   {-1, -8, 33, 80, 33, -8, -1, 0},
   {-1, -8, 30, 80, 35, -8, -1, 1},
   {-1, -8, 28, 80, 37, -7, -2, 1},
@@ -66,10 +101,132 @@
   {1, -1, -8, 35, 80, 30, -8, -1},
 };
 
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+  {2, -11,  25,  96,  25, -11,   2, 0},
+  {2, -11,  22,  96,  28, -11,   2, 0},
+  {2, -10,  19,  95,  31, -11,   2, 0},
+  {2, -10,  17,  95,  34, -12,   2, 0},
+  {2,  -9,  14,  94,  37, -12,   2, 0},
+  {2,  -8,  12,  93,  40, -12,   1, 0},
+  {2,  -8,   9,  92,  43, -12,   1, 1},
+  {2,  -7,   7,  91,  46, -12,   1, 0},
+  {2,  -7,   5,  90,  49, -12,   1, 0},
+  {2,  -6,   3,  88,  52, -12,   0, 1},
+  {2,  -5,   1,  86,  55, -12,   0, 1},
+  {2,  -5,  -1,  84,  58, -11,   0, 1},
+  {2,  -4,  -2,  82,  61, -11,  -1, 1},
+  {2,  -4,  -4,  80,  64, -10,  -1, 1},
+  {1, -3, -5, 77, 67, -9, -1, 1},
+  {1, -3, -6, 75, 70, -8, -2, 1},
+  {1, -2, -7, 72, 72, -7, -2, 1},
+  {1, -2, -8, 70, 75, -6, -3, 1},
+  {1, -1, -9, 67, 77, -5, -3, 1},
+  {1,  -1, -10,  64,  80,  -4,  -4, 2},
+  {1,  -1, -11,  61,  82,  -2,  -4, 2},
+  {1,   0, -11,  58,  84,  -1,  -5, 2},
+  {1,   0, -12,  55,  86,   1,  -5, 2},
+  {1,   0, -12,  52,  88,   3,  -6, 2},
+  {0,   1, -12,  49,  90,   5,  -7, 2},
+  {0,   1, -12,  46,  91,   7,  -7, 2},
+  {1,   1, -12,  43,  92,   9,  -8, 2},
+  {0,   1, -12,  40,  93,  12,  -8, 2},
+  {0,   2, -12,  37,  94,  14,  -9, 2},
+  {0,   2, -12,  34,  95,  17, -10, 2},
+  {0,   2, -11,  31,  95,  19, -10, 2},
+  {0,   2, -11,  28,  96,  22, -11, 2}
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+  {3,  -8,  13, 112,  13,  -8,   3, 0},
+  {3,  -7,  10, 112,  17,  -9,   3, -1},
+  {2,  -6,   7, 111,  21,  -9,   3, -1},
+  {2,  -5,   4, 111,  24, -10,   3, -1},
+  {2,  -4,   1, 110,  28, -11,   3, -1},
+  {1,  -3,  -1, 108,  32, -12,   4, -1},
+  {1,  -2,  -3, 106,  36, -13,   4, -1},
+  {1,  -1,  -6, 105,  40, -14,   4, -1},
+  {1,  -1,  -7, 102,  44, -14,   4, -1},
+  {1,   0,  -9, 100,  48, -15,   4, -1},
+  {1,   1, -11,  97,  53, -16,   4, -1},
+  {0,   1, -12,  95,  57, -16,   4, -1},
+  {0,   2, -13,  91,  61, -16,   4, -1},
+  {0,   2, -14,  88,  65, -16,   4, -1},
+  {0,   3, -15,  84,  69, -17,   4, 0},
+  {0,   3, -16,  81,  73, -16,   3, 0},
+  {0,   3, -16,  77,  77, -16,   3, 0},
+  {0,   3, -16,  73,  81, -16,   3, 0},
+  {0,   4, -17,  69,  84, -15,   3, 0},
+  {-1,   4, -16,  65,  88, -14,   2, 0},
+  {-1,   4, -16,  61,  91, -13,   2, 0},
+  {-1,   4, -16,  57,  95, -12,   1, 0},
+  {-1,   4, -16,  53,  97, -11,   1, 1},
+  {-1,   4, -15,  48, 100,  -9,   0, 1},
+  {-1,   4, -14,  44, 102,  -7,  -1, 1},
+  {-1,   4, -14,  40, 105,  -6,  -1, 1},
+  {-1,   4, -13,  36, 106,  -3,  -2, 1},
+  {-1,   4, -12,  32, 108,  -1,  -3, 1},
+  {-1,   3, -11,  28, 110,   1,  -4, 2},
+  {-1,   3, -10,  24, 111,   4,  -5, 2},
+  {-1,   3,  -9,  21, 111,   7,  -6, 2},
+  {-1,   3,  -9,  17, 112,  10,  -7, 3}
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -3, 128,   3,  -1,   0, 0},
+  {-1,   2,  -6, 127,   7,  -2,   1, 0},
+  {-1,   3,  -9, 126,  12,  -4,   1, 0},
+  {-1,   4, -12, 125,  16,  -5,   1, 0},
+  {-1,   4, -14, 123,  20,  -6,   2, 0},
+  {-1,   5, -15, 120,  25,  -8,   2, 0},
+  {-1,   5, -17, 118,  30,  -9,   3, -1},
+  {-1,   6, -18, 114,  35, -10,   3, -1},
+  {-1,   6, -19, 111,  41, -12,   3, -1},
+  {-1,   6, -20, 107,  46, -13,   4, -1},
+  {-1,   6, -21, 103,  52, -14,   4, -1},
+  {-1,   6, -21,  99,  57, -16,   5, -1},
+  {-1,   6, -21,  94,  63, -17,   5, -1},
+  {-1,   6, -20,  89,  68, -18,   5, -1},
+  {-1,   6, -20,  84,  73, -19,   6, -1},
+  {-1,   6, -20,  79,  79, -20,   6, -1},
+  {-1,   6, -19,  73,  84, -20,   6, -1},
+  {-1,   5, -18,  68,  89, -20,   6, -1},
+  {-1,   5, -17,  63,  94, -21,   6, -1},
+  {-1,   5, -16,  57,  99, -21,   6, -1},
+  {-1,   4, -14,  52, 103, -21,   6, -1},
+  {-1,   4, -13,  46, 107, -20,   6, -1},
+  {-1,   3, -12,  41, 111, -19,   6, -1},
+  {-1,   3, -10,  35, 114, -18,   6, -1},
+  {-1,   3,  -9,  30, 118, -17,   5, -1},
+  {0,   2,  -8,  25, 120, -15,   5, -1},
+  {0,   2,  -6,  20, 123, -14,   4, -1},
+  {0,   1,  -5,  16, 125, -12,   4, -1},
+  {0,   1,  -4,  12, 126,  -9,   3, -1},
+  {0,   1,  -2,   7, 127,  -6,   2, -1},
+  {0,   0,  -1,   3, 128,  -3,   1, 0}
+};
+
 // Filters for factor of 2 downsampling.
 static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1};
 static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3};
 
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+  int outlength16 = outlength * 16;
+  if (outlength16 >= inlength * 16)
+    return vp9_filteredinterp_filters1000;
+  else if (outlength16 >= inlength * 13)
+    return vp9_filteredinterp_filters875;
+  else if (outlength16 >= inlength * 11)
+    return vp9_filteredinterp_filters750;
+  else if (outlength16 >= inlength * 9)
+    return vp9_filteredinterp_filters625;
+  else
+    return vp9_filteredinterp_filters500;
+}
+
 static void interpolate(const uint8_t *const input, int inlength,
                         uint8_t *output, int outlength) {
   const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
@@ -81,6 +238,9 @@
   int x, x1, x2, sum, k, int_pel, sub_pel;
   int64_t y;
 
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
   x = 0;
   y = offset;
   while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
@@ -101,7 +261,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k) {
         const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
@@ -116,7 +276,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k)
         sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
@@ -129,7 +289,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k)
         sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
@@ -140,7 +300,7 @@
       const int16_t *filter;
       int_pel = y >> INTERP_PRECISION_BITS;
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
-      filter = vp9_filteredinterp_filters[sub_pel];
+      filter = interp_filters[sub_pel];
       sum = 0;
       for (k = 0; k < INTERP_TAPS; ++k)
         sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
diff --git a/vp9/encoder/vp9_resize.h b/vp9/encoder/vp9_resize.h
index c67595a..1818cd4 100644
--- a/vp9/encoder/vp9_resize.h
+++ b/vp9/encoder/vp9_resize.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_RESIZE_H_
 
 #include <stdio.h>
+#include "vpx/vpx_integer.h"
 
 void vp9_resize_plane(const uint8_t *const input,
                       int height,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index c2eea0a..c9a4246 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -60,7 +60,7 @@
                             scale,
                             16, 16,
                             which_mv,
-                            &xd->subpix, MV_PRECISION_Q3, x, y);
+                            xd->interp_kernel, MV_PRECISION_Q3, x, y);
 
   vp9_build_inter_predictor(u_mb_ptr, uv_stride,
                             &pred[256], uv_block_size,
@@ -68,7 +68,7 @@
                             scale,
                             uv_block_size, uv_block_size,
                             which_mv,
-                            &xd->subpix, mv_precision_uv, x, y);
+                            xd->interp_kernel, mv_precision_uv, x, y);
 
   vp9_build_inter_predictor(v_mb_ptr, uv_stride,
                             &pred[512], uv_block_size,
@@ -76,7 +76,7 @@
                             scale,
                             uv_block_size, uv_block_size,
                             which_mv,
-                            &xd->subpix, mv_precision_uv, x, y);
+                            xd->interp_kernel, mv_precision_uv, x, y);
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index d81b72b..ea031fb 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -163,7 +163,7 @@
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_avx2(__m128i *in) {
+void fdct4_avx2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -196,7 +196,7 @@
   transpose_4x4_avx2(in);
 }
 
-void fadst4_1d_avx2(__m128i *in) {
+void fadst4_avx2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -250,20 +250,20 @@
   load_buffer_4x4_avx2(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct4_1d_avx2(in);
-      fdct4_1d_avx2(in);
+      fdct4_avx2(in);
+      fdct4_avx2(in);
       break;
     case 1:  // ADST_DCT
-      fadst4_1d_avx2(in);
-      fdct4_1d_avx2(in);
+      fadst4_avx2(in);
+      fdct4_avx2(in);
       break;
     case 2:  // DCT_ADST
-      fdct4_1d_avx2(in);
-      fadst4_1d_avx2(in);
+      fdct4_avx2(in);
+      fadst4_avx2(in);
       break;
     case 3:  // ADST_ADST
-      fadst4_1d_avx2(in);
-      fadst4_1d_avx2(in);
+      fadst4_avx2(in);
+      fadst4_avx2(in);
       break;
     default:
       assert(0);
@@ -658,7 +658,7 @@
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_avx2(__m128i *in) {
+void fdct8_avx2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -798,7 +798,7 @@
   array_transpose_8x8_avx2(in, in);
 }
 
-void fadst8_1d_avx2(__m128i *in) {
+void fadst8_avx2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1034,20 +1034,20 @@
   load_buffer_8x8_avx2(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct8_1d_avx2(in);
-      fdct8_1d_avx2(in);
+      fdct8_avx2(in);
+      fdct8_avx2(in);
       break;
     case 1:  // ADST_DCT
-      fadst8_1d_avx2(in);
-      fdct8_1d_avx2(in);
+      fadst8_avx2(in);
+      fdct8_avx2(in);
       break;
     case 2:  // DCT_ADST
-      fdct8_1d_avx2(in);
-      fadst8_1d_avx2(in);
+      fdct8_avx2(in);
+      fadst8_avx2(in);
       break;
     case 3:  // ADST_ADST
-      fadst8_1d_avx2(in);
-      fadst8_1d_avx2(in);
+      fadst8_avx2(in);
+      fadst8_avx2(in);
       break;
     default:
       assert(0);
@@ -1216,7 +1216,7 @@
         step1_6 = _mm_sub_epi16(in01, in14);
         step1_7 = _mm_sub_epi16(in00, in15);
       }
-      // Work on the first eight values; fdct8_1d(input, even_results);
+      // Work on the first eight values; fdct8(input, even_results);
       {
         // Add/substract
         const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1730,7 +1730,7 @@
   right_shift_8x8_avx2(res1 + 8, 2);
 }
 
-void fdct16_1d_8col_avx2(__m128i *in) {
+void fdct16_8col_avx2(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2052,7 +2052,7 @@
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col_avx2(__m128i *in) {
+void fadst16_8col_avx2(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2522,15 +2522,15 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col_avx2(in0);
-  fdct16_1d_8col_avx2(in1);
+void fdct16_avx2(__m128i *in0, __m128i *in1) {
+  fdct16_8col_avx2(in0);
+  fdct16_8col_avx2(in1);
   array_transpose_16x16_avx2(in0, in1);
 }
 
-void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col_avx2(in0);
-  fadst16_1d_8col_avx2(in1);
+void fadst16_avx2(__m128i *in0, __m128i *in1) {
+  fadst16_8col_avx2(in0);
+  fadst16_8col_avx2(in1);
   array_transpose_16x16_avx2(in0, in1);
 }
 
@@ -2540,24 +2540,24 @@
   load_buffer_16x16_avx2(input, in0, in1, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       break;
     default:
       assert(0);
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 65431bd..c876cc2 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -161,7 +161,7 @@
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_sse2(__m128i *in) {
+void fdct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -194,7 +194,7 @@
   transpose_4x4(in);
 }
 
-void fadst4_1d_sse2(__m128i *in) {
+void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -248,20 +248,20 @@
   load_buffer_4x4(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+      fdct4_sse2(in);
+      fdct4_sse2(in);
       break;
     case 1:  // ADST_DCT
-      fadst4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
       break;
     case 2:  // DCT_ADST
-      fdct4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
       break;
     case 3:  // ADST_ADST
-      fadst4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
       break;
     default:
       assert(0);
@@ -656,7 +656,7 @@
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_sse2(__m128i *in) {
+void fdct8_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -796,7 +796,7 @@
   array_transpose_8x8(in, in);
 }
 
-void fadst8_1d_sse2(__m128i *in) {
+void fadst8_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1032,20 +1032,20 @@
   load_buffer_8x8(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+      fdct8_sse2(in);
+      fdct8_sse2(in);
       break;
     case 1:  // ADST_DCT
-      fadst8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
       break;
     case 2:  // DCT_ADST
-      fdct8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
       break;
     case 3:  // ADST_ADST
-      fadst8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
       break;
     default:
       assert(0);
@@ -1214,7 +1214,7 @@
         step1_6 = _mm_sub_epi16(in01, in14);
         step1_7 = _mm_sub_epi16(in00, in15);
       }
-      // Work on the first eight values; fdct8_1d(input, even_results);
+      // Work on the first eight values; fdct8(input, even_results);
       {
         // Add/substract
         const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1728,7 +1728,7 @@
   right_shift_8x8(res1 + 8, 2);
 }
 
-void fdct16_1d_8col(__m128i *in) {
+void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2050,7 +2050,7 @@
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col(__m128i *in) {
+void fadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2520,15 +2520,15 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col(in0);
-  fdct16_1d_8col(in1);
+void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);
+  fdct16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col(in0);
-  fadst16_1d_8col(in1);
+void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
@@ -2538,24 +2538,24 @@
   load_buffer_16x16(input, in0, in1, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       break;
     default:
       assert(0);
diff --git a/vp9_spatial_scalable_encoder.c b/vp9_spatial_scalable_encoder.c
index b637331..e71094a 100644
--- a/vp9_spatial_scalable_encoder.c
+++ b/vp9_spatial_scalable_encoder.c
@@ -89,14 +89,6 @@
   exit(EXIT_FAILURE);
 }
 
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-  const char *detail = vpx_codec_error_detail(ctx);
-
-  printf("%s: %s\n", s, vpx_codec_error(ctx));
-  if (detail) printf("    %s\n", detail);
-  exit(EXIT_FAILURE);
-}
-
 static void parse_command_line(int argc, const char **argv_,
                                AppInput *app_input, SvcContext *svc_ctx,
                                vpx_codec_enc_cfg_t *enc_cfg) {