Merge "Add a strict mode for auto_min_max_partition_size feature"
diff --git a/examples.mk b/examples.mk
index 6940353..0b62df9 100644
--- a/examples.mk
+++ b/examples.mk
@@ -41,6 +41,7 @@
 vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h vpxenc.h
 vpxenc.SRCS                 += ivfdec.c ivfdec.h
 vpxenc.SRCS                 += ivfenc.c ivfenc.h
+vpxenc.SRCS                 += rate_hist.c rate_hist.h
 vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += warnings.c warnings.h
 vpxenc.SRCS                 += webmenc.c webmenc.h
@@ -79,6 +80,8 @@
 simple_decoder.SRCS                += tools_common.h tools_common.c
 simple_decoder.DESCRIPTION         = Simplified decoder loop
 GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += postproc.c
+postproc.SRCS                      += ivfdec.h ivfdec.c
+postproc.SRCS                      += tools_common.h tools_common.c
 postproc.GUID                    = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION             = Decoder postprocessor control
 GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += decode_to_md5.c
@@ -99,6 +102,8 @@
 force_keyframe.DESCRIPTION       = Force generation of keyframes
 ifeq ($(CONFIG_DECODERS),yes)
 GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += decode_with_drops.c
+decode_with_drops.SRCS             += ivfdec.h ivfdec.c
+decode_with_drops.SRCS             += tools_common.h tools_common.c
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index bfb6d3a..12686de 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -52,126 +52,103 @@
 // The example decides whether to drop the frame based on the current
 // frame number, immediately before decoding the frame.
 
-#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include "./ivfdec.h"
+
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
+
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
-#define interface (vpx_codec_vp8_dx())
 
+#include "./tools_common.h"
+#include "./vpx_config.h"
 
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
+static const char *exec_name;
 
-static unsigned int mem_get_le32(const unsigned char *mem) {
-    return (mem[3] << 24)|(mem[2] << 16)|(mem[1] << 8)|(mem[0]);
+void usage_exit() {
+  fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name);
+  exit(EXIT_FAILURE);
 }
 
-static void die(const char *fmt, ...) {
-    va_list ap;
-
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
-
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
-}
-
-
 int main(int argc, char **argv) {
-    FILE            *infile, *outfile;
-    vpx_codec_ctx_t  codec;
-    int              flags = 0, frame_cnt = 0;
-    unsigned char    file_hdr[IVF_FILE_HDR_SZ];
-    unsigned char    frame_hdr[IVF_FRAME_HDR_SZ];
-    unsigned char    frame[256*1024];
-    vpx_codec_err_t  res;
-    int              n, m, is_range;
+  FILE *infile, *outfile;
+  vpx_codec_ctx_t codec;
+  vpx_codec_iface_t *iface;
+  int flags = 0, frame_cnt = 0;
+  vpx_video_t *video;
+  int n, m, is_range;
+  char *nptr;
 
-    (void)res;
-    /* Open files */
-    if(argc!=4)
-        die("Usage: %s <infile> <outfile> <N-M|N/M>\n", argv[0]);
-    {
-        char *nptr;
-        n = strtol(argv[3], &nptr, 0);
-        m = strtol(nptr+1, NULL, 0);
-        is_range = *nptr == '-';
-        if(!n || !m || (*nptr != '-' && *nptr != '/'))
-            die("Couldn't parse pattern %s\n", argv[3]);
+  exec_name = argv[0];
+
+  if (argc != 4)
+    die("Invalid number of arguments");
+
+  if (!(infile = fopen(argv[1], "rb")))
+    die("Failed to open %s for reading", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing", argv[2]);
+
+  // "N-M" drops frames N through M; "N/M" drops the last N of every M frames.
+  n = strtol(argv[3], &nptr, 0);
+  is_range = (*nptr == '-');
+  // Only parse M after a separator; otherwise nptr + 1 is past the string.
+  m = (is_range || *nptr == '/') ? strtol(nptr + 1, NULL, 0) : 0;
+  if (!n || !m)
+    die("Couldn't parse pattern %s\n", argv[3]);
+
+  video = vpx_video_open_file(infile);
+  if (!video)
+    die("%s is not a supported input file.", argv[1]);
+
+  iface = get_codec_interface(vpx_video_get_fourcc(video));
+  if (!iface)
+    die("Unknown FOURCC code.");
+
+  printf("Using %s\n", vpx_codec_iface_name(iface));
+
+  if (vpx_codec_dec_init(&codec, iface, NULL, flags))
+    die_codec(&codec, "Failed to initialize decoder");
+
+  while (vpx_video_read_frame(video)) {
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img = NULL;
+    size_t frame_size = 0;
+    int skip;
+    const unsigned char *frame = vpx_video_get_frame(video, &frame_size);
+
+    ++frame_cnt;
+    skip = (is_range && frame_cnt >= n && frame_cnt <= m) ||
+           (!is_range && m - (frame_cnt - 1) % m <= n);
+    if (!skip) {
+      putc('.', stdout);
+      // Decode only the frames that are kept, so drops happen before decoding.
+      if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+        die_codec(&codec, "Failed to decode frame");
+      while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
+        vpx_img_write(img, outfile);
+    } else {
+      putc('X', stdout);
     }
-    if(!(infile = fopen(argv[1], "rb")))
-        die("Failed to open %s for reading", argv[1]);
-    if(!(outfile = fopen(argv[2], "wb")))
-        die("Failed to open %s for writing", argv[2]);
 
-    /* Read file header */
-    if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
-         && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
-         && file_hdr[3]=='F'))
-        die("%s is not an IVF file.", argv[1]);
+    fflush(stdout);
+  }
 
-    printf("Using %s\n",vpx_codec_iface_name(interface));
-    /* Initialize codec */
-    if(vpx_codec_dec_init(&codec, interface, NULL, flags))
-        die_codec(&codec, "Failed to initialize decoder");
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec");
 
-    /* Read each frame */
-    while(fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
-        int               frame_sz = mem_get_le32(frame_hdr);
-        vpx_codec_iter_t  iter = NULL;
-        vpx_image_t      *img;
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         vpx_video_get_width(video), vpx_video_get_height(video), argv[2]);
 
+  vpx_video_close(video);
 
-        frame_cnt++;
-        if(frame_sz > sizeof(frame))
-            die("Frame %d data too big for example code buffer", frame_sz);
-        if(fread(frame, 1, frame_sz, infile) != frame_sz)
-            die("Frame %d failed to read complete frame", frame_cnt);
+  fclose(outfile);
+  fclose(infile);
 
-        if((is_range && frame_cnt >= n && frame_cnt <= m)
-           ||(!is_range && m - (frame_cnt-1)%m <= n)) {
-           putc('X', stdout);
-           continue;
-        }
-        putc('.', stdout);
-        fflush(stdout);
-        /* Decode the frame */
-        if(vpx_codec_decode(&codec, frame, frame_sz, NULL, 0))
-            die_codec(&codec, "Failed to decode frame");
-
-        /* Write decoded data to disk */
-        while((img = vpx_codec_get_frame(&codec, &iter))) {
-            unsigned int plane, y;
-
-            for(plane=0; plane < 3; plane++) {
-                unsigned char *buf =img->planes[plane];
-            
-                for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-                    (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
-                                  outfile);
-                    buf += img->stride[plane];
-                }
-            }
-        }
-    }
-    printf("Processed %d frames.\n",frame_cnt);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
-
-    fclose(outfile);
-    fclose(infile);
-    return EXIT_SUCCESS;
+  return EXIT_SUCCESS;
 }
diff --git a/examples/postproc.c b/examples/postproc.c
index 0d0d6b5..4ec2d1f 100644
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -39,130 +39,105 @@
 // postprocessors. VP8 is one example. The following sample code toggles
 // postprocessing on and off every 15 frames.
 
-#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include "./ivfdec.h"
+
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
+
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
-#define interface (vpx_codec_vp8_dx())
 
+#include "./tools_common.h"
+#include "./vpx_config.h"
 
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
+static const char *exec_name;
 
-static unsigned int mem_get_le32(const unsigned char *mem) {
-    return (mem[3] << 24)|(mem[2] << 16)|(mem[1] << 8)|(mem[0]);
+void usage_exit() {
+  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
 }
 
-static void die(const char *fmt, ...) {
-    va_list ap;
-
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
-
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
-}
-
-
 int main(int argc, char **argv) {
-    FILE            *infile, *outfile;
-    vpx_codec_ctx_t  codec;
-    int              flags = 0, frame_cnt = 0;
-    unsigned char    file_hdr[IVF_FILE_HDR_SZ];
-    unsigned char    frame_hdr[IVF_FRAME_HDR_SZ];
-    unsigned char    frame[256*1024];
-    vpx_codec_err_t  res;
+  FILE *infile, *outfile;
+  vpx_codec_ctx_t codec;
+  vpx_codec_iface_t *iface;
+  int frame_cnt = 0;
+  vpx_video_t *video;
+  vpx_codec_err_t res;
 
-    (void)res;
-    /* Open files */
-    if(argc!=3)
-        die("Usage: %s <infile> <outfile>\n", argv[0]);
-    if(!(infile = fopen(argv[1], "rb")))
-        die("Failed to open %s for reading", argv[1]);
-    if(!(outfile = fopen(argv[2], "wb")))
-        die("Failed to open %s for writing", argv[2]);
+  exec_name = argv[0];
 
-    /* Read file header */
-    if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
-         && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
-         && file_hdr[3]=='F'))
-        die("%s is not an IVF file.", argv[1]);
+  if (argc != 3)
+    die("Invalid number of arguments");
 
-    printf("Using %s\n",vpx_codec_iface_name(interface));
-    /* Initialize codec */
-    res = vpx_codec_dec_init(&codec, interface, NULL,
-                             VPX_CODEC_USE_POSTPROC);
-    if(res == VPX_CODEC_INCAPABLE) {
-        printf("NOTICE: Postproc not supported by %s\n",
-               vpx_codec_iface_name(interface));
-        res = vpx_codec_dec_init(&codec, interface, NULL, flags);
+  if (!(infile = fopen(argv[1], "rb")))
+    die("Failed to open %s for reading", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing", argv[2]);
+
+  video = vpx_video_open_file(infile);
+  if (!video)
+    die("%s is not a supported input file.", argv[1]);
+
+  iface = get_codec_interface(vpx_video_get_fourcc(video));
+  if (!iface)
+    die("Unknown FOURCC code.");
+
+  printf("Using %s\n", vpx_codec_iface_name(iface));
+
+
+  res = vpx_codec_dec_init(&codec, iface, NULL, VPX_CODEC_USE_POSTPROC);
+  if (res == VPX_CODEC_INCAPABLE) {
+    printf("NOTICE: Postproc not supported.\n");
+    res = vpx_codec_dec_init(&codec, iface, NULL, 0);
+  }
+
+  if (res)
+    die_codec(&codec, "Failed to initialize decoder");
+
+  while (vpx_video_read_frame(video)) {
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img = NULL;
+    size_t frame_size = 0;
+    const unsigned char *frame = vpx_video_get_frame(video, &frame_size);
+
+    ++frame_cnt;
+
+    // Toggle postprocessing: off at frames 1, 31, ...; on at frames 16, 46, ...
+    if (frame_cnt % 30 == 1) {
+      vp8_postproc_cfg_t pp = {0, 0, 0};
+      if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
+        die_codec(&codec, "Failed to turn off postproc");
+    } else if (frame_cnt % 30 == 16) {
+      vp8_postproc_cfg_t pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE,
+                               4, 0};
+      if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
+        die_codec(&codec, "Failed to turn on postproc");
+    }
+
+    // Decode the frame with 15ms deadline
+    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 15000))
+      die_codec(&codec, "Failed to decode frame");
+
+    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
+      vpx_img_write(img, outfile);
     }
-    if(res)
-        die_codec(&codec, "Failed to initialize decoder");
+  }
 
-    /* Read each frame */
-    while(fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
-        int               frame_sz = mem_get_le32(frame_hdr);
-        vpx_codec_iter_t  iter = NULL;
-        vpx_image_t      *img;
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec");
 
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         vpx_video_get_width(video), vpx_video_get_height(video), argv[2]);
 
-        frame_cnt++;
-        if(frame_sz > sizeof(frame))
-            die("Frame %d data too big for example code buffer", frame_sz);
-        if(fread(frame, 1, frame_sz, infile) != frame_sz)
-            die("Frame %d failed to read complete frame", frame_cnt);
+  vpx_video_close(video);
 
-        #if CONFIG_VP9_DECODER
-        if(frame_cnt%30 == 1) {
-            vp8_postproc_cfg_t  pp = {0, 0, 0};
-
-            if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
-                die_codec(&codec, "Failed to turn off postproc");
-        } else if(frame_cnt%30 == 16) {
-            vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0};
-
-            if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
-                die_codec(&codec, "Failed to turn on postproc");
-        };
-        #endif
-        /* Decode the frame with 15ms deadline */
-        if(vpx_codec_decode(&codec, frame, frame_sz, NULL, 15000))
-            die_codec(&codec, "Failed to decode frame");
-
-        /* Write decoded data to disk */
-        while((img = vpx_codec_get_frame(&codec, &iter))) {
-            unsigned int plane, y;
-
-            for(plane=0; plane < 3; plane++) {
-                unsigned char *buf =img->planes[plane];
-            
-                for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-                    (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
-                                  outfile);
-                    buf += img->stride[plane];
-                }
-            }
-        }
-    }
-    printf("Processed %d frames.\n",frame_cnt);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
-
-    fclose(outfile);
-    fclose(infile);
-    return EXIT_SUCCESS;
+  fclose(outfile);
+  fclose(infile);
+  return EXIT_SUCCESS;
 }
diff --git a/rate_hist.c b/rate_hist.c
new file mode 100644
index 0000000..1cef19b
--- /dev/null
+++ b/rate_hist.c
@@ -0,0 +1,369 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "./rate_hist.h"
+
+// Number of bitrate bins. Each bin spans 2% of the target bitrate, so the
+// target itself falls in bin RATE_BINS / 2.
+#define RATE_BINS 100
+// Width, in characters, of a full-length histogram bar.
+#define HIST_BAR_MAX 40
+
+// One histogram bar: the [low, high] range of values it covers and the number
+// of samples that fell into that range.
+struct hist_bucket {
+  int low;
+  int high;
+  int count;
+};
+
+// Sliding-window bitrate statistics for one encode.
+struct rate_hist {
+  int64_t *pts;  // ring buffer of frame timestamps, in milliseconds
+  int *sz;       // ring buffer of frame sizes, parallel to pts
+  int samples;   // capacity of the pts and sz ring buffers
+  int frames;    // number of frames recorded so far
+  struct hist_bucket bucket[RATE_BINS];
+  int total;     // number of bitrate samples counted in bucket[]
+};
+
+// Allocates a rate histogram sized for cfg's rate-control buffer window at
+// the given frame rate. Returns NULL, after printing a warning to stderr, if
+// an argument is NULL, fps->den is zero or memory cannot be allocated. The
+// result must be released with destroy_rate_histogram().
+struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg,
+                                      const vpx_rational_t *fps) {
+  int i;
+  struct rate_hist *hist = NULL;
+
+  // fps->den is used as a divisor below.
+  if (cfg == NULL || fps == NULL || fps->den == 0)
+    goto fail;
+
+  hist = malloc(sizeof(*hist));
+  if (hist == NULL)
+    goto fail;
+
+  // Clear the buffer pointers first so the failure path can free them.
+  hist->pts = NULL;
+  hist->sz = NULL;
+
+  // Determine the number of samples in the buffer. Use the file's framerate
+  // to determine the number of frames in rc_buf_sz milliseconds, with an
+  // adjustment (5/4) to account for alt-refs
+  hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
+
+  // prevent division by zero and a negative ring-buffer size
+  if (hist->samples <= 0)
+    hist->samples = 1;
+
+  hist->frames = 0;
+  hist->total = 0;
+
+  // low == INT_MAX marks a bucket that has not received a sample yet.
+  for (i = 0; i < RATE_BINS; i++) {
+    hist->bucket[i].low = INT_MAX;
+    hist->bucket[i].high = 0;
+    hist->bucket[i].count = 0;
+  }
+
+  hist->pts = calloc(hist->samples, sizeof(*hist->pts));
+  hist->sz = calloc(hist->samples, sizeof(*hist->sz));
+  if (hist->pts == NULL || hist->sz == NULL)
+    goto fail;
+
+  return hist;
+
+fail:
+  fprintf(stderr, "Warning: unable to set up the rate histogram; "
+                  "continuing without it.\n");
+  destroy_rate_histogram(hist);
+  return NULL;
+}
+
+// Frees a histogram returned by init_rate_histogram(). NULL is accepted.
+void destroy_rate_histogram(struct rate_hist *hist) {
+  if (hist) {
+    free(hist->pts);
+    free(hist->sz);
+    free(hist);
+  }
+}
+
+// Records one encoded frame and, once the frame timestamp has reached
+// rc_buf_initial_sz, adds the average bitrate over the trailing rc_buf_sz
+// window to the histogram. Does nothing if an argument is NULL or if the
+// timebase denominator or the target bitrate is zero.
+void update_rate_histogram(struct rate_hist *hist,
+                           const vpx_codec_enc_cfg_t *cfg,
+                           const vpx_codec_cx_pkt_t *pkt) {
+  int i;
+  int idx;
+  int64_t now;
+  int64_t target_bitrate;
+  int64_t then = 0;
+  int64_t avg_bitrate = 0;
+  int64_t sum_sz = 0;
+
+  // init_rate_histogram() returns NULL on failure; treat that as "disabled".
+  if (hist == NULL || cfg == NULL || pkt == NULL)
+    return;
+
+  // Both values are used as divisors below. Widen before scaling so the
+  // product cannot wrap.
+  target_bitrate = (int64_t)cfg->rc_target_bitrate * 1000;
+  if (cfg->g_timebase.den == 0 || target_bitrate <= 0)
+    return;
+
+  now = pkt->data.frame.pts * 1000 * (uint64_t)cfg->g_timebase.num /
+        (uint64_t)cfg->g_timebase.den;
+
+  idx = hist->frames++ % hist->samples;
+  hist->pts[idx] = now;
+  hist->sz[idx] = (int)pkt->data.frame.sz;
+
+  if (now < cfg->rc_buf_initial_sz)
+    return;
+
+  then = now;
+
+  /* Sum the size over the past rc_buf_sz ms */
+  for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) {
+    const int i_idx = (i - 1) % hist->samples;
+
+    then = hist->pts[i_idx];
+    if (now - then > cfg->rc_buf_sz)
+      break;
+    sum_sz += hist->sz[i_idx];
+  }
+
+  // A zero-length window has no defined rate (and would divide by zero).
+  if (now == then)
+    return;
+
+  avg_bitrate = sum_sz * 8 * 1000 / (now - then);
+  // Map the rate to a bin so that the target bitrate lands in the middle.
+  idx = (int)(avg_bitrate * (RATE_BINS / 2) / target_bitrate);
+  if (idx < 0)
+    idx = 0;
+  if (idx > RATE_BINS - 1)
+    idx = RATE_BINS - 1;
+  if (hist->bucket[idx].low > avg_bitrate)
+    hist->bucket[idx].low = (int)avg_bitrate;
+  if (hist->bucket[idx].high < avg_bitrate)
+    hist->bucket[idx].high = (int)avg_bitrate;
+  hist->bucket[idx].count++;
+  hist->total++;
+}
+
+// Reduces bucket[0..*num_buckets) to at most max_buckets entries by
+// repeatedly folding the bucket with the smallest count into one of its
+// neighbours. Stores the new bucket count in *num_buckets and returns the
+// largest count among the remaining buckets (0 if there are none).
+static int merge_hist_buckets(struct hist_bucket *bucket,
+                              int max_buckets, int *num_buckets) {
+  int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
+  int buckets = *num_buckets;
+  int i;
+
+  // With no buckets there is nothing to merge, and bucket[0] may not hold
+  // valid data.
+  if (buckets <= 0)
+    return 0;
+
+  /* Find the extrema for this list of buckets */
+  big_bucket = small_bucket = 0;
+  for (i = 0; i < buckets; i++) {
+    if (bucket[i].count < bucket[small_bucket].count)
+      small_bucket = i;
+    if (bucket[i].count > bucket[big_bucket].count)
+      big_bucket = i;
+  }
+
+  /* If we have too many buckets, merge the smallest with an adjacent
+   * bucket. A lone bucket has no neighbour, so never go below one.
+   */
+  while (buckets > max_buckets && buckets > 1) {
+    int last_bucket = buckets - 1;
+
+    /* merge the small bucket with an adjacent one. */
+    if (small_bucket == 0)
+      merge_bucket = 1;
+    else if (small_bucket == last_bucket)
+      merge_bucket = last_bucket - 1;
+    else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count)
+      merge_bucket = small_bucket - 1;
+    else
+      merge_bucket = small_bucket + 1;
+
+    assert(abs(merge_bucket - small_bucket) <= 1);
+    assert(small_bucket < buckets);
+    assert(big_bucket < buckets);
+    assert(merge_bucket < buckets);
+
+    if (merge_bucket < small_bucket) {
+      bucket[merge_bucket].high = bucket[small_bucket].high;
+      bucket[merge_bucket].count += bucket[small_bucket].count;
+    } else {
+      bucket[small_bucket].high = bucket[merge_bucket].high;
+      bucket[small_bucket].count += bucket[merge_bucket].count;
+      merge_bucket = small_bucket;
+    }
+
+    assert(bucket[merge_bucket].low != bucket[merge_bucket].high);
+
+    buckets--;
+
+    /* Remove the merge_bucket from the list, and find the new small
+     * and big buckets while we're at it
+     */
+    big_bucket = small_bucket = 0;
+    for (i = 0; i < buckets; i++) {
+      if (i > merge_bucket)
+        bucket[i] = bucket[i + 1];
+
+      if (bucket[i].count < bucket[small_bucket].count)
+        small_bucket = i;
+      if (bucket[i].count > bucket[big_bucket].count)
+        big_bucket = i;
+    }
+  }
+
+  *num_buckets = buckets;
+  return bucket[big_bucket].count;
+}
+
+// Prints one text bar per bucket to stderr. total is the sample count used
+// for the percentage column; scale is the count drawn as a full-width bar.
+static void show_histogram(const struct hist_bucket *bucket,
+                           int buckets, int total, int scale) {
+  const char *pat1, *pat2;
+  int digits = 1;
+  int max_val;
+  int i;
+
+  // Nothing to draw; also avoids reading bucket[-1] and dividing by zero.
+  if (buckets <= 0 || scale <= 0)
+    return;
+
+  // Size the label columns from the digit count of the last bucket's upper
+  // bound. Integer division is used instead of log(): log(0) is not finite
+  // (0 is a valid value here) and the floating-point quotient can fall just
+  // short at exact powers of ten.
+  for (max_val = bucket[buckets - 1].high; max_val >= 10; max_val /= 10)
+    digits++;
+
+  switch (digits) {
+    case 1:
+    case 2:
+      pat1 = "%4d %2s: ";
+      pat2 = "%4d-%2d: ";
+      break;
+    case 3:
+      pat1 = "%5d %3s: ";
+      pat2 = "%5d-%3d: ";
+      break;
+    case 4:
+      pat1 = "%6d %4s: ";
+      pat2 = "%6d-%4d: ";
+      break;
+    case 5:
+      pat1 = "%7d %5s: ";
+      pat2 = "%7d-%5d: ";
+      break;
+    case 6:
+      pat1 = "%8d %6s: ";
+      pat2 = "%8d-%6d: ";
+      break;
+    case 7:
+      pat1 = "%9d %7s: ";
+      pat2 = "%9d-%7d: ";
+      break;
+    default:
+      pat1 = "%12d %10s: ";
+      pat2 = "%12d-%10d: ";
+      break;
+  }
+
+  for (i = 0; i < buckets; i++) {
+    int len;
+    int j;
+    float pct;
+
+    pct = (float)(100.0 * bucket[i].count / total);
+    len = HIST_BAR_MAX * bucket[i].count / scale;
+    if (len < 1)
+      len = 1;
+    assert(len <= HIST_BAR_MAX);
+
+    if (bucket[i].low == bucket[i].high)
+      fprintf(stderr, pat1, bucket[i].low, "");
+    else
+      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+
+    for (j = 0; j < HIST_BAR_MAX; j++)
+      fprintf(stderr, j < len ? "=" : " ");
+    fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
+  }
+}
+
+// Prints a histogram of quantizer usage to stderr. counts[i] is the number of
+// samples for quantizer index i; indices with a zero count are omitted and
+// the rest are merged down to at most max_buckets bars.
+void show_q_histogram(const int counts[64], int max_buckets) {
+  struct hist_bucket bucket[64];
+  int buckets = 0;
+  int total = 0;
+  int scale;
+  int i;
+
+  for (i = 0; i < 64; i++) {
+    if (counts[i]) {
+      bucket[buckets].low = bucket[buckets].high = i;
+      bucket[buckets].count = counts[i];
+      buckets++;
+      total += counts[i];
+    }
+  }
+
+  fprintf(stderr, "\nQuantizer Selection:\n");
+  scale = merge_hist_buckets(bucket, max_buckets, &buckets);
+  show_histogram(bucket, buckets, total, scale);
+}
+
+// Prints the collected bitrate histogram to stderr, merged down to at most
+// max_buckets bars. Does nothing if hist or cfg is NULL. The buckets stored
+// in hist are compacted and merged in place, so hist should not be updated
+// again afterwards.
+void show_rate_histogram(struct rate_hist *hist,
+                         const vpx_codec_enc_cfg_t *cfg, int max_buckets) {
+  int i, scale;
+  int buckets = 0;
+
+  // init_rate_histogram() returns NULL on failure; there is nothing to show.
+  if (hist == NULL || cfg == NULL)
+    return;
+
+  for (i = 0; i < RATE_BINS; i++) {
+    if (hist->bucket[i].low == INT_MAX)
+      continue;
+    hist->bucket[buckets++] = hist->bucket[i];
+  }
+
+  fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz);
+  scale = merge_hist_buckets(hist->bucket, max_buckets, &buckets);
+  show_histogram(hist->bucket, buckets, hist->total, scale);
+}
diff --git a/rate_hist.h b/rate_hist.h
new file mode 100644
index 0000000..00a1676
--- /dev/null
+++ b/rate_hist.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef RATE_HIST_H_
+#define RATE_HIST_H_
+
+#include "vpx/vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque histogram state; the definition lives in rate_hist.c.
+struct rate_hist;
+
+// Creates the histogram state for an encode described by cfg running at fps.
+struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg,
+                                      const vpx_rational_t *fps);
+
+// Frees a histogram created by init_rate_histogram(); NULL is a no-op.
+void destroy_rate_histogram(struct rate_hist *hist);
+
+// Records the encoded frame packet pkt in the histogram.
+void update_rate_histogram(struct rate_hist *hist,
+                           const vpx_codec_enc_cfg_t *cfg,
+                           const vpx_codec_cx_pkt_t *pkt);
+
+// Prints a histogram of the 64 per-quantizer counts to stderr, merged down to
+// at most max_buckets bars.
+void show_q_histogram(const int counts[64], int max_buckets);
+
+// Prints the collected bitrate histogram to stderr, merged down to at most
+// max_buckets bars.
+void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg,
+                         int max_buckets);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // RATE_HIST_H_
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 9f400e9..dbde6d5 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -106,6 +106,9 @@
 };
 
+// Returns the kernel table stored for `filter` in vp9_filter_kernels.
+// NOTE(review): SWITCHABLE presumably has no entry in that table -- confirm.
 const interp_kernel *vp9_get_interp_kernel(INTERP_FILTER filter) {
+  assert(filter != SWITCHABLE);
   return vp9_filter_kernels[filter];
 }
 
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f954236..8a2297f 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -23,6 +23,80 @@
   const short *filter
 );
 
+// Defines vp9_convolve8_<name>_<opt>(), a 1-D convolution wrapper. When
+// step_q4 == 16 and filter[3] != 128 the block is handled in column strips of
+// 16, 8 and then 4 pixels by vp9_filter_block1d{16,8,4}_<dir>8_<avg><opt>(),
+// each reading from src_start. Any width left over (the whole block if the
+// condition fails) goes to vp9_convolve8_<name>_c(). step_q4 and filter are
+// expected to name parameters of the generated function, e.g. x_step_q4 and
+// filter_x.
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                  uint8_t *dst, ptrdiff_t dst_stride, \
+                                  const int16_t *filter_x, int x_step_q4, \
+                                  const int16_t *filter_y, int y_step_q4, \
+                                  int w, int h) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    while (w >= 16) { \
+      vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
+                                               dst, dst_stride, \
+                                               h, filter); \
+      src += 16; \
+      dst += 16; \
+      w -= 16; \
+    } \
+    while (w >= 8) { \
+      vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
+                                              dst, dst_stride, \
+                                              h, filter); \
+      src += 8; \
+      dst += 8; \
+      w -= 8; \
+    } \
+    while (w >= 4) { \
+      vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
+                                              dst, dst_stride, \
+                                              h, filter); \
+      src += 4; \
+      dst += 4; \
+      w -= 4; \
+    } \
+  } \
+  if (w) { \
+    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+                             filter_x, x_step_q4, filter_y, y_step_q4, \
+                             w, h); \
+  } \
+}
+
+// Defines vp9_convolve8_<avg><opt>(), a 2-D convolution for blocks of at most
+// 64x64. When x_step_q4 and y_step_q4 are both 16 it runs the horizontal pass
+// over h + 7 rows, starting 3 rows above src, into the 64-wide scratch buffer
+// fdata2, then the vertical pass from row 3 of fdata2 into dst. Otherwise it
+// falls back to vp9_convolve8_<avg>c().
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, int x_step_q4, \
+                              const int16_t *filter_y, int y_step_q4, \
+                              int w, int h) { \
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
+  \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+                              filter_x, x_step_q4, filter_y, y_step_q4, \
+                              w, h + 7); \
+    vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, y_step_q4, \
+                                    w, h); \
+  } else { \
+    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+  } \
+}
+
 #if HAVE_SSSE3
 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
@@ -37,201 +99,44 @@
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
 
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
 
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h);
-  }
-}
-
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4,
-                             w, h);
-  }
-}
-
-void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
-
-void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
-                                 filter_x, x_step_q4, filter_y, y_step_q4,
-                                 w, h);
-  } else {
-    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
 #endif
 
 #if HAVE_SSE2
@@ -248,199 +153,41 @@
 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
 
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_sse2(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
+// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                  uint8_t *dst, ptrdiff_t dst_stride,
+//                                  const int16_t *filter_x, int x_step_q4,
+//                                  const int16_t *filter_y, int y_step_q4,
+//                                  int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
 
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h);
-  }
-}
-
-void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4,
-                             w, h);
-  }
-}
-
-void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
-                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
-
-void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
-  assert(w <= 64);
-  assert(h <= 64);
-  if (x_step_q4 == 16 && y_step_q4 == 16) {
-    vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h + 7);
-    vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
-                                 filter_x, x_step_q4, filter_y, y_step_q4,
-                                 w, h);
-  } else {
-    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  }
-}
+// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                         uint8_t *dst, ptrdiff_t dst_stride,
+//                         const int16_t *filter_x, int x_step_q4,
+//                         const int16_t *filter_y, int y_step_q4,
+//                         int w, int h);
+// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                             uint8_t *dst, ptrdiff_t dst_stride,
+//                             const int16_t *filter_x, int x_step_q4,
+//                             const int16_t *filter_y, int y_step_q4,
+//                             int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
 #endif
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 70a88ab..6a8a34c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2690,7 +2690,7 @@
     }
   } else {
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       // Adjust the zbin based on this MB rate.
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3b641a1..4488189 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -25,13 +25,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERP_FILTER filter,
-                              VP9_COMMON *cm) {
-  xd->interp_kernel = vp9_get_interp_kernel(filter == SWITCHABLE ? EIGHTTAP
-                                                                 : filter);
-  assert(((intptr_t)xd->interp_kernel & 0xff) == 0);
-}
-
 void vp9_subtract_block_c(int rows, int cols,
                           int16_t *diff_ptr, ptrdiff_t diff_stride,
                           const uint8_t *src_ptr, ptrdiff_t src_stride,
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index c728efd..9f6c9f0 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -48,8 +48,7 @@
 void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
 
 int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERP_FILTER filter,
-                              VP9_COMMON *cm);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 28b343c..af9fa1b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -491,10 +491,7 @@
   int sum_in_vectors = 0;
   uint32_t lastmv_as_int = 0;
   struct twopass_rc *const twopass = &cpi->twopass;
-
-  int_mv zero_ref_mv;
-
-  zero_ref_mv.as_int = 0;
+  const MV zero_mv = {0, 0};
 
   vp9_clear_system_state();  // __asm emms;
 
@@ -503,8 +500,7 @@
   setup_dst_planes(xd, new_yv12, 0, 0);
 
   xd->mi_8x8 = cm->mi_grid_visible;
-  // required for vp9_frame_init_quantizer
-  xd->mi_8x8[0] = cm->mi;
+  xd->mi_8x8[0] = cm->mi;  // required for vp9_frame_init_quantizer
 
   setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -518,14 +514,8 @@
   }
   x->skip_recode = 0;
 
-
-  // Initialise the MV cost table to the defaults
-  // if( cm->current_video_frame == 0)
-  // if ( 0 )
-  {
-    vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi);
-  }
+  vp9_init_mv_probs(cm);
+  vp9_initialize_rd_consts(cpi);
 
   // tiling is ignored in the first pass
   vp9_tile_init(&tile, cm, 0, 0);
@@ -550,7 +540,7 @@
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
       int this_error;
-      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
       const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
 
@@ -568,7 +558,7 @@
                      cm->mi_rows, cm->mi_cols);
 
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        int energy = vp9_block_energy(cpi, x, bsize);
+        const int energy = vp9_block_energy(cpi, x, bsize);
         error_weight = vp9_vaq_inv_q_ratio(energy);
       }
 
@@ -594,8 +584,7 @@
       // Set up limit values for motion vectors to prevent them extending
       // outside the UMV borders.
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
-                      + BORDER_MV_PIXELS_B16;
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
       // Other than for the first frame do a motion search
       if (cm->current_video_frame > 0) {
@@ -620,7 +609,7 @@
         // based search as well.
         if (best_ref_mv.as_int) {
           tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_ref_mv.as_mv, &tmp_mv.as_mv,
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &tmp_err);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();  // __asm emms;
@@ -641,17 +630,15 @@
           xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
           gf_motion_error = zz_motion_search(cpi, x);
 
-          first_pass_motion_search(cpi, x, &zero_ref_mv.as_mv, &tmp_mv.as_mv,
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &gf_motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();  // __asm emms;
             gf_motion_error *= error_weight;
           }
 
-          if ((gf_motion_error < motion_error) &&
-              (gf_motion_error < this_error)) {
+          if (gf_motion_error < motion_error && gf_motion_error < this_error)
             second_ref_count++;
-          }
 
           // Reset to last frame as reference buffer
           xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
@@ -779,13 +766,11 @@
       fps.mvr_abs = (double)sum_mvr_abs / mvcount;
       fps.MVc = (double)sum_mvc / mvcount;
       fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) /
-                     mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) /
-                     mvcount;
+      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-      fps.pcnt_motion = (double)mvcount / cpi->common.MBs;
+      fps.pcnt_motion = (double)mvcount / cm->MBs;
     } else {
       fps.MVr = 0.0;
       fps.mvr_abs = 0.0;
@@ -913,8 +898,7 @@
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-static int estimate_max_q(VP9_COMP *cpi,
-                          FIRSTPASS_STATS *fpstats,
+static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
   int q;
   const int num_mbs = cpi->common.MBs;
@@ -1088,12 +1072,12 @@
 
 // This function gives and estimate of how badly we believe
 // the prediction quality is decaying from frame to frame.
-static double get_prediction_decay_rate(VP9_COMP *cpi,
-                                        FIRSTPASS_STATS *next_frame) {
+static double get_prediction_decay_rate(const VP9_COMMON *cm,
+                                        const FIRSTPASS_STATS *next_frame) {
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
   const double mb_sr_err_diff = (next_frame->sr_coded_error -
-                                     next_frame->coded_error) / cpi->common.MBs;
+                                     next_frame->coded_error) / cm->MBs;
   const double second_ref_decay = mb_sr_err_diff <= 512.0
       ? fclamp(pow(1.0 - (mb_sr_err_diff / 512.0), 0.5), 0.85, 1.0)
       : 0.85;
@@ -1121,7 +1105,6 @@
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
-    double zz_inter;
 
     // Look ahead a few frames to see if static condition
     // persists...
@@ -1129,11 +1112,10 @@
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
-      zz_inter = (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
-      if (zz_inter < 0.999)
+      if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
         break;
     }
-    // Reset file position
+
     reset_fpf_position(&cpi->twopass, position);
 
     // Only if it does do we signal a transition to still
@@ -1147,14 +1129,14 @@
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this
-static int detect_flash(VP9_COMP *cpi, int offset) {
+static int detect_flash(const struct twopass_rc *twopass, int offset) {
   FIRSTPASS_STATS next_frame;
 
   int flash_detected = 0;
 
   // Read the frame data.
   // The return is FALSE (no flash detected) if not a valid frame
-  if (read_frame_stats(&cpi->twopass, &next_frame, offset) != EOF) {
+  if (read_frame_stats(twopass, &next_frame, offset) != EOF) {
     // What we are looking for here is a situation where there is a
     // brief break in prediction (such as a flash) but subsequent frames
     // are reasonably well predicted by an earlier (pre flash) frame.
@@ -1183,16 +1165,15 @@
   // Accumulate Motion In/Out of frame stats
   *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
   *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
-  *abs_mv_in_out_accumulator +=
-    fabs(this_frame->mv_in_out_count * motion_pct);
+  *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct);
 
   // Accumulate a measure of how uniform (or conversely how random)
   // the motion field is. (A ratio of absmv / mv)
   if (motion_pct > 0.05) {
-    double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+    const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
 
-    double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+    const double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
 
     *mv_ratio_accumulator += (this_frame_mvr_ratio < this_frame->mvr_abs)
@@ -1235,7 +1216,7 @@
                           int f_frames, int b_frames,
                           int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
-
+  struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
   double mv_ratio_accumulator = 0.0;
@@ -1248,7 +1229,7 @@
 
   // Search forward from the proposed arf/next gf position
   for (i = 0; i < f_frames; i++) {
-    if (read_frame_stats(&cpi->twopass, &this_frame, (i + offset)) == EOF)
+    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
     // Update the motion related elements to the boost calculation
@@ -1259,12 +1240,12 @@
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1285,7 +1266,7 @@
 
   // Search backward towards last gf position
   for (i = -1; i >= -b_frames; i--) {
-    if (read_frame_stats(&cpi->twopass, &this_frame, (i + offset)) == EOF)
+    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
     // Update the motion related elements to the boost calculation
@@ -1296,12 +1277,12 @@
 
     // We want to discount the the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1461,6 +1442,7 @@
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   FIRSTPASS_STATS next_frame = { 0 };
   FIRSTPASS_STATS *start_pos;
+  struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
   double old_boost_score = 0.0;
@@ -1481,8 +1463,8 @@
   double mv_ratio_accumulator_thresh;
   int max_bits = frame_max_bits(cpi);     // Max for a single frame
 
-  unsigned int allow_alt_ref =
-    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+  unsigned int allow_alt_ref = cpi->oxcf.play_alternate &&
+                               cpi->oxcf.lag_in_frames;
 
   int f_boost = 0;
   int b_boost = 0;
@@ -1490,11 +1472,11 @@
   int active_max_gf_interval;
   RATE_CONTROL *const rc = &cpi->rc;
 
-  cpi->twopass.gf_group_bits = 0;
+  twopass->gf_group_bits = 0;
 
   vp9_clear_system_state();  // __asm emms;
 
-  start_pos = cpi->twopass.stats_in;
+  start_pos = twopass->stats_in;
 
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1525,20 +1507,19 @@
     active_max_gf_interval = rc->max_gf_interval;
 
   i = 0;
-  while ((i < cpi->twopass.static_scene_max_gf_interval) &&
-         (i < rc->frames_to_key)) {
+  while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) {
     i++;    // Increment the loop counter
 
     // Accumulate error score of frames in this gf group
     mod_frame_err = calculate_modified_err(cpi, this_frame);
     gf_group_err += mod_frame_err;
 
-    if (EOF == input_stats(&cpi->twopass, &next_frame))
+    if (EOF == input_stats(twopass, &next_frame))
       break;
 
     // Test for the case where there is a brief flash but the prediction
     // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(cpi, 0);
+    flash_detected = detect_flash(twopass, 0);
 
     // Update the motion related elements to the boost calculation
     accumulate_frame_motion_stats(&next_frame,
@@ -1549,14 +1530,14 @@
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
       if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
           zero_motion_accumulator) {
-        zero_motion_accumulator =
-          (next_frame.pcnt_inter - next_frame.pcnt_motion);
+        zero_motion_accumulator = next_frame.pcnt_inter -
+                                      next_frame.pcnt_motion;
       }
 
       // Break clause to detect very still sections after motion
@@ -1594,14 +1575,14 @@
     old_boost_score = boost_score;
   }
 
-  cpi->twopass.gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
   // Don't allow a gf too near the next kf
   if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
       i++;
 
-      if (EOF == input_stats(&cpi->twopass, this_frame))
+      if (EOF == input_stats(twopass, this_frame))
         break;
 
       if (i < rc->frames_to_key) {
@@ -2064,7 +2045,7 @@
 
 
       // How fast is prediction quality decaying
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concened with decay in prediction
@@ -2198,8 +2179,8 @@
         r = RMAX;
 
       // How fast is prediction quality decaying
-      if (!detect_flash(cpi, 0)) {
-        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      if (!detect_flash(twopass, 0)) {
+        loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
         decay_accumulator *= loop_decay_rate;
         decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
@@ -2328,6 +2309,7 @@
       (cpi->oxcf.auto_key && (cpi->rc.frames_since_key %
                               cpi->key_frame_frequency == 0))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2335,6 +2317,9 @@
   cpi->rc.baseline_gf_interval = INT_MAX;
 }
 
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS   1
+
 void vp9_get_one_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (!cpi->refresh_alt_ref_frame &&
@@ -2346,13 +2331,20 @@
     cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
                                     cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
-    cpi->rc.kf_boost = 300;
+    cpi->rc.kf_boost = 2000;
+    cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
   }
   if (cpi->rc.frames_till_gf_update_due == 0) {
+    cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL;
     cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (cpi->rc.frames_till_gf_update_due > cpi->rc.frames_to_key)
+      cpi->rc.frames_till_gf_update_due = cpi->rc.frames_to_key;
     cpi->refresh_golden_frame = 1;
+    cpi->rc.source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+    cpi->rc.gfu_boost = 1000;
   }
 }
 
@@ -2366,7 +2358,8 @@
     cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
                                     cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
-    cpi->rc.kf_boost = 300;
+    cpi->rc.kf_boost = 2000;
+    cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2400,12 +2393,13 @@
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  if (cpi->refresh_alt_ref_frame) {
-    cpi->common.frame_type = INTER_FRAME;
-    return;
-  }
   if (!cpi->twopass.stats_in)
     return;
+  if (cpi->refresh_alt_ref_frame) {
+    cpi->common.frame_type = INTER_FRAME;
+    rc->per_frame_bandwidth = cpi->twopass.gf_bits;
+    return;
+  }
 
   vp9_clear_system_state();
 
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index ee73ff1..e6e59c0 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -173,7 +173,6 @@
                                             int index) {
   struct lookahead_entry *buf = NULL;
 
-  assert(index < (int)ctx->max_sz);
   if (index < (int)ctx->sz) {
     index += ctx->read_idx;
     if (index >= (int)ctx->max_sz)
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 4a0138e..256ebac 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -2956,15 +2956,6 @@
   // Clear down mmx registers to allow floating point in what follows.
   vp9_clear_system_state();
 
-  // For an alt ref frame in 2 pass we skip the call to the second
-  // pass function that sets the target bandwidth so we must set it here.
-  if (cpi->refresh_alt_ref_frame) {
-    // Set a per frame bit target for the alt ref frame.
-    cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
-    // Set a per second target bitrate.
-    cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
-  }
-
   // Clear zbin over-quant value and mode boost values.
   cpi->zbin_mode_boost = 0;
 
@@ -3296,7 +3287,6 @@
 
   vp9_get_second_pass_params(cpi);
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
 
   vp9_twopass_postencode_update(cpi, *size);
 }
@@ -3446,8 +3436,7 @@
       if (cpi->oxcf.arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
         // TODO(agrange) merge these two functions.
-        configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
-                              cpi->rc.gfu_boost);
+        vp9_configure_arnr_filter(cpi, frames_to_arf, cpi->rc.gfu_boost);
         vp9_temporal_filter_prepare(cpi, frames_to_arf);
         vp9_extend_frame_borders(&cpi->alt_ref_buffer,
                                  cm->subsampling_x, cm->subsampling_y);
@@ -3463,7 +3452,9 @@
 #if CONFIG_MULTIPLE_ARF
       if (!cpi->multi_arf_enabled)
 #endif
-        cpi->rc.source_alt_ref_pending = 0;   // Clear Pending altf Ref flag.
+        cpi->rc.source_alt_ref_pending = 0;
+    } else {
+      cpi->rc.source_alt_ref_pending = 0;
     }
   }
 
@@ -3583,11 +3574,11 @@
   }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
-  vp9_setup_interp_filters(xd, DEFAULT_INTERP_FILTER, cm);
+  xd->interp_kernel = vp9_get_interp_kernel(
+      DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER);
 
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-      vp9_vaq_init();
-  }
+  if (cpi->oxcf.aq_mode == VARIANCE_AQ)
+    vp9_vaq_init();
 
   if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index ec00b3d..0a6aab9 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -8,25 +8,28 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_ONYX_INT_H_
 #define VP9_ENCODER_VP9_ONYX_INT_H_
 
 #include <stdio.h>
+
 #include "./vpx_config.h"
-#include "vp9/common/vp9_onyx.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymode.h"
 #include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vp9/encoder/vp9_mcomp.h"
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_onyx.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_variance.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -98,18 +101,6 @@
 } FIRSTPASS_STATS;
 
 typedef struct {
-  int frames_so_far;
-  double frame_intra_error;
-  double frame_coded_error;
-  double frame_pcnt_inter;
-  double frame_pcnt_motion;
-  double frame_mvr;
-  double frame_mvr_abs;
-  double frame_mvc;
-  double frame_mvc_abs;
-} ONEPASS_FRAMESTATS;
-
-typedef struct {
   struct {
     int err;
     union {
@@ -427,67 +418,6 @@
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 } SPEED_FEATURES;
 
-typedef struct {
-  // Rate targetting variables
-  int this_frame_target;
-  int projected_frame_size;
-  int sb64_target_rate;
-  int last_q[3];                   // Separate values for Intra/Inter/ARF-GF
-  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
-
-  int gfu_boost;
-  int last_boost;
-  int kf_boost;
-
-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_gf_update_due;  // Count down till next GF
-  unsigned int max_gf_interval;
-  unsigned int baseline_gf_interval;
-  unsigned int frames_to_key;
-  unsigned int frames_since_key;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-  unsigned int source_alt_ref_pending;
-  unsigned int source_alt_ref_active;
-  unsigned int is_src_frame_alt_ref;
-
-  int per_frame_bandwidth;        // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;     // Average frame size target for clip
-  int min_frame_bandwidth;        // Minimum allocation used for any frame
-  int max_frame_bandwidth;        // Maximum burst rate allowed for a frame.
-
-  int ni_av_qi;
-  int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex[3];  // 0 - KEY, 1 - INTER, 2 - ARF/GF
-  double tot_q;
-  double avg_q;
-
-  int buffer_level;
-  int bits_off_target;
-
-  int decimation_factor;
-  int decimation_count;
-
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
-  int64_t total_actual_bits;
-  int total_target_vs_actual;        // debug stats
-
-  int worst_quality;
-  int active_worst_quality;
-  int best_quality;
-  // int active_best_quality;
-} RATE_CONTROL;
-
 typedef struct VP9_COMP {
   DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
@@ -614,11 +544,6 @@
   int64_t target_bandwidth;
   struct vpx_codec_pkt_list  *output_pkt_list;
 
-#if 0
-  // Experimental code for lagged and one pass
-  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
-  int one_pass_frame_index;
-#endif
   MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
   int mbgraph_n_frames;             // number of frames filled in the above
   int static_mb_pct;                // % forced skip mbs by segmentation
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 3c816a3..74eb98f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -218,7 +218,7 @@
   vp9_clear_system_state();  // __asm emms;
 
   // For 1-pass.
-  if (cpi->pass == 0) {
+  if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
     if (cpi->common.current_video_frame == 0) {
       target = oxcf->starting_buffer_level / 2;
     } else {
@@ -246,7 +246,7 @@
 
   if (oxcf->rc_max_intra_bitrate_pct) {
     const int max_rate = rc->per_frame_bandwidth *
-                             oxcf->rc_max_intra_bitrate_pct / 100;
+        oxcf->rc_max_intra_bitrate_pct / 100;
     target = MIN(target, max_rate);
   }
   rc->this_frame_target = target;
@@ -375,27 +375,22 @@
 static void calc_pframe_target_size(VP9_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  int min_frame_target = MAX(rc->min_frame_bandwidth,
-                             rc->av_per_frame_bandwidth >> 5);
-  if (cpi->refresh_alt_ref_frame) {
-    // Special alt reference frame case
-    // Per frame bit target for the alt ref frame
-    rc->per_frame_bandwidth = cpi->twopass.gf_bits;
-    rc->this_frame_target = rc->per_frame_bandwidth;
-  } else {
-    // Normal frames (gf and inter).
-    rc->this_frame_target = rc->per_frame_bandwidth;
-    // Set target frame size based on buffer level, for 1 pass CBR.
-    if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
-      // Need to decide how low min_frame_target should be for 1-pass CBR.
-      // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
-      min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
-                             FRAME_OVERHEAD_BITS);
-      rc->this_frame_target = target_size_from_buffer_level(oxcf, rc);
-      // Adjust qp-max based on buffer level.
-      rc->active_worst_quality =
-          adjust_active_worst_quality_from_buffer_level(oxcf, rc);
-    }
+  int min_frame_target;
+  rc->this_frame_target = rc->per_frame_bandwidth;
+
+  if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
+    // Need to decide how low min_frame_target should be for 1-pass CBR.
+    // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
+    min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
+                           FRAME_OVERHEAD_BITS);
+    rc->this_frame_target = target_size_from_buffer_level(oxcf, rc);
+    // Adjust qp-max based on buffer level.
+    rc->active_worst_quality =
+        adjust_active_worst_quality_from_buffer_level(oxcf, rc);
+
+    if (rc->this_frame_target < min_frame_target)
+      rc->this_frame_target = min_frame_target;
+    return;
   }
 
   // Check that the total sum of adjustments is not above the maximum allowed.
@@ -404,6 +399,9 @@
   // not capable of recovering all the extra bits we have spent in the KF or GF,
   // then the remainder will have to be recovered over a longer time span via
   // other buffer / rate control mechanisms.
+  min_frame_target = MAX(rc->min_frame_bandwidth,
+                         rc->av_per_frame_bandwidth >> 5);
+
   if (rc->this_frame_target < min_frame_target)
     rc->this_frame_target = min_frame_target;
 
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 65ddead..eba4b7a 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -12,66 +12,127 @@
 #ifndef VP9_ENCODER_VP9_RATECTRL_H_
 #define VP9_ENCODER_VP9_RATECTRL_H_
 
-#include "vp9/encoder/vp9_onyx_int.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define FRAME_OVERHEAD_BITS 200
 
-void vp9_save_coding_context(VP9_COMP *cpi);
-void vp9_restore_coding_context(VP9_COMP *cpi);
+typedef struct {
+  // Rate targeting variables
+  int this_frame_target;
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[3];                   // Separate values for Intra/Inter/ARF-GF
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
 
-void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+
+  double rate_correction_factor;
+  double key_frame_rate_correction_factor;
+  double gf_rate_correction_factor;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_gf_update_due;  // Count down till next GF
+  unsigned int max_gf_interval;
+  unsigned int baseline_gf_interval;
+  unsigned int frames_to_key;
+  unsigned int frames_since_key;
+  unsigned int this_key_frame_forced;
+  unsigned int next_key_frame_forced;
+  unsigned int source_alt_ref_pending;
+  unsigned int source_alt_ref_active;
+  unsigned int is_src_frame_alt_ref;
+
+  int per_frame_bandwidth;        // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;     // Average frame size target for clip
+  int min_frame_bandwidth;        // Minimum allocation used for any frame
+  int max_frame_bandwidth;        // Maximum burst rate allowed for a frame.
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[3];  // 0 - KEY, 1 - INTER, 2 - ARF/GF
+  double tot_q;
+  double avg_q;
+
+  int buffer_level;
+  int bits_off_target;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int64_t total_actual_bits;
+  int total_target_vs_actual;        // debug stats
+
+  int worst_quality;
+  int active_worst_quality;
+  int best_quality;
+  // int active_best_quality;
+} RATE_CONTROL;
+
+struct VP9_COMP;
+
+void vp9_save_coding_context(struct VP9_COMP *cpi);
+void vp9_restore_coding_context(struct VP9_COMP *cpi);
+
+void vp9_setup_key_frame(struct VP9_COMP *cpi);
+void vp9_setup_inter_frame(struct VP9_COMP *cpi);
 
 double vp9_convert_qindex_to_q(int qindex);
 
 // Updates rate correction factors
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
 
 // initialize luts for minq
 void vp9_rc_init_minq_luts(void);
 
 // return of 0 means drop frame
 // Changes only rc.this_frame_target and rc.sb64_rate_target
-int vp9_rc_pick_frame_size_target(VP9_COMP *cpi);
+int vp9_rc_pick_frame_size_target(struct VP9_COMP *cpi);
 
-void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
+void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
                                       int this_frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit);
 
 // Picks q and q bounds given the target for bits
-int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
+int vp9_rc_pick_q_and_adjust_q_bounds(const struct VP9_COMP *cpi,
                                       int *bottom_index,
                                       int *top_index);
 
 // Estimates q to achieve a target bits per frame
-int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality);
 
 // Post encode update of the rate control parameters based
 // on bytes used
-void vp9_rc_postencode_update(VP9_COMP *cpi,
+void vp9_rc_postencode_update(struct VP9_COMP *cpi,
                               uint64_t bytes_used);
 // for dropped frames
-void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi);
+void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
 
 // estimates bits per mb for a given qindex and correction factor
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
                        double correction_factor);
 
 // Post encode update of the rate control parameters for 2-pass
-void vp9_twopass_postencode_update(VP9_COMP *cpi,
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi,
                                    uint64_t bytes_used);
 
 // Decide if we should drop this frame: For 1-pass CBR.
-int vp9_drop_frame(VP9_COMP *cpi);
+int vp9_drop_frame(struct VP9_COMP *cpi);
 
 // Update the buffer level.
-void vp9_update_buffer_level(VP9_COMP *cpi, int encoded_frame_size);
+void vp9_update_buffer_level(struct VP9_COMP *cpi, int encoded_frame_size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0c80687..24b41a9 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2795,7 +2795,7 @@
         int j;
         int64_t rs_rd;
         mbmi->interp_filter = i;
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+        xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
         rs = get_switchable_rate(x);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
@@ -2866,7 +2866,7 @@
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+  xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
   rs = cm->interp_filter == SWITCHABLE ? get_switchable_rate(x) : 0;
 
   if (pred_exists) {
@@ -3294,8 +3294,9 @@
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter;
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (comp_pred) {
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -3919,8 +3920,9 @@
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter;
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
     if (comp_pred) {
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -4043,8 +4045,7 @@
             int newbest, rs;
             int64_t rs_rd;
             mbmi->interp_filter = switchable_filter_index;
-            vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
+            xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
             tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
                                                  &mbmi->ref_mvs[ref_frame][0],
                                                  second_ref,
@@ -4109,7 +4110,7 @@
 
       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                              tmp_best_filter : cm->interp_filter);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+      xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index c9a4246..e822e4c 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -392,7 +392,6 @@
   const int num_frames_backward = distance;
   const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
                                - (num_frames_backward + 1);
-
   struct scale_factors sf;
 
   switch (blur_type) {
@@ -408,7 +407,6 @@
 
     case 2:
       // Forward Blur
-
       frames_to_blur_forward = num_frames_forward;
 
       if (frames_to_blur_forward >= max_frames)
@@ -471,22 +469,24 @@
                             strength, &sf);
 }
 
-void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
-                           const int group_boost) {
+void vp9_configure_arnr_filter(VP9_COMP *cpi,
+                               const unsigned int frames_to_arnr,
+                               const int group_boost) {
   int half_gf_int;
   int frames_after_arf;
   int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
   int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
   int q;
 
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
+  // Define the arnr filter width for this group of frames. We only
+  // filter frames that lie within a distance of half the GF interval
+  // from the ARF frame. We also have to trap cases where the filter
+  // extends beyond the end of the lookahead buffer.
+  // Note: frames_to_arnr parameter is the offset of the arnr
+  // frame from the current frame.
   half_gf_int = cpi->rc.baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
+  frames_after_arf = vp9_lookahead_depth(cpi->lookahead)
+      - frames_to_arnr - 1;
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h
index eea2f10..3028d78 100644
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -16,8 +16,9 @@
 #endif
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
-void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
-                           const int group_boost);
+void vp9_configure_arnr_filter(VP9_COMP *cpi,
+                               const unsigned int frames_to_arnr,
+                               const int group_boost);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpxenc.c b/vpxenc.c
index f19300a..f1feb47 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -38,6 +38,7 @@
 #include "./tools_common.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
+#include "./rate_hist.h"
 #include "./vpxstats.h"
 #include "./warnings.h"
 #include "./webmenc.h"
@@ -472,274 +473,6 @@
   exit(EXIT_FAILURE);
 }
 
-
-#define HIST_BAR_MAX 40
-struct hist_bucket {
-  int low, high, count;
-};
-
-
-static int merge_hist_buckets(struct hist_bucket *bucket,
-                              int *buckets_,
-                              int max_buckets) {
-  int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
-  int buckets = *buckets_;
-  int i;
-
-  /* Find the extrema for this list of buckets */
-  big_bucket = small_bucket = 0;
-  for (i = 0; i < buckets; i++) {
-    if (bucket[i].count < bucket[small_bucket].count)
-      small_bucket = i;
-    if (bucket[i].count > bucket[big_bucket].count)
-      big_bucket = i;
-  }
-
-  /* If we have too many buckets, merge the smallest with an adjacent
-   * bucket.
-   */
-  while (buckets > max_buckets) {
-    int last_bucket = buckets - 1;
-
-    /* merge the small bucket with an adjacent one. */
-    if (small_bucket == 0)
-      merge_bucket = 1;
-    else if (small_bucket == last_bucket)
-      merge_bucket = last_bucket - 1;
-    else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count)
-      merge_bucket = small_bucket - 1;
-    else
-      merge_bucket = small_bucket + 1;
-
-    assert(abs(merge_bucket - small_bucket) <= 1);
-    assert(small_bucket < buckets);
-    assert(big_bucket < buckets);
-    assert(merge_bucket < buckets);
-
-    if (merge_bucket < small_bucket) {
-      bucket[merge_bucket].high = bucket[small_bucket].high;
-      bucket[merge_bucket].count += bucket[small_bucket].count;
-    } else {
-      bucket[small_bucket].high = bucket[merge_bucket].high;
-      bucket[small_bucket].count += bucket[merge_bucket].count;
-      merge_bucket = small_bucket;
-    }
-
-    assert(bucket[merge_bucket].low != bucket[merge_bucket].high);
-
-    buckets--;
-
-    /* Remove the merge_bucket from the list, and find the new small
-     * and big buckets while we're at it
-     */
-    big_bucket = small_bucket = 0;
-    for (i = 0; i < buckets; i++) {
-      if (i > merge_bucket)
-        bucket[i] = bucket[i + 1];
-
-      if (bucket[i].count < bucket[small_bucket].count)
-        small_bucket = i;
-      if (bucket[i].count > bucket[big_bucket].count)
-        big_bucket = i;
-    }
-
-  }
-
-  *buckets_ = buckets;
-  return bucket[big_bucket].count;
-}
-
-
-static void show_histogram(const struct hist_bucket *bucket,
-                           int                       buckets,
-                           int                       total,
-                           int                       scale) {
-  const char *pat1, *pat2;
-  int i;
-
-  switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
-    case 1:
-    case 2:
-      pat1 = "%4d %2s: ";
-      pat2 = "%4d-%2d: ";
-      break;
-    case 3:
-      pat1 = "%5d %3s: ";
-      pat2 = "%5d-%3d: ";
-      break;
-    case 4:
-      pat1 = "%6d %4s: ";
-      pat2 = "%6d-%4d: ";
-      break;
-    case 5:
-      pat1 = "%7d %5s: ";
-      pat2 = "%7d-%5d: ";
-      break;
-    case 6:
-      pat1 = "%8d %6s: ";
-      pat2 = "%8d-%6d: ";
-      break;
-    case 7:
-      pat1 = "%9d %7s: ";
-      pat2 = "%9d-%7d: ";
-      break;
-    default:
-      pat1 = "%12d %10s: ";
-      pat2 = "%12d-%10d: ";
-      break;
-  }
-
-  for (i = 0; i < buckets; i++) {
-    int len;
-    int j;
-    float pct;
-
-    pct = (float)(100.0 * bucket[i].count / total);
-    len = HIST_BAR_MAX * bucket[i].count / scale;
-    if (len < 1)
-      len = 1;
-    assert(len <= HIST_BAR_MAX);
-
-    if (bucket[i].low == bucket[i].high)
-      fprintf(stderr, pat1, bucket[i].low, "");
-    else
-      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
-
-    for (j = 0; j < HIST_BAR_MAX; j++)
-      fprintf(stderr, j < len ? "=" : " ");
-    fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
-  }
-}
-
-
-static void show_q_histogram(const int counts[64], int max_buckets) {
-  struct hist_bucket bucket[64];
-  int buckets = 0;
-  int total = 0;
-  int scale;
-  int i;
-
-
-  for (i = 0; i < 64; i++) {
-    if (counts[i]) {
-      bucket[buckets].low = bucket[buckets].high = i;
-      bucket[buckets].count = counts[i];
-      buckets++;
-      total += counts[i];
-    }
-  }
-
-  fprintf(stderr, "\nQuantizer Selection:\n");
-  scale = merge_hist_buckets(bucket, &buckets, max_buckets);
-  show_histogram(bucket, buckets, total, scale);
-}
-
-
-#define RATE_BINS (100)
-struct rate_hist {
-  int64_t            *pts;
-  int                *sz;
-  int                 samples;
-  int                 frames;
-  struct hist_bucket  bucket[RATE_BINS];
-  int                 total;
-};
-
-
-static void init_rate_histogram(struct rate_hist *hist,
-                                const vpx_codec_enc_cfg_t *cfg,
-                                const vpx_rational_t *fps) {
-  int i;
-
-  /* Determine the number of samples in the buffer. Use the file's framerate
-   * to determine the number of frames in rc_buf_sz milliseconds, with an
-   * adjustment (5/4) to account for alt-refs
-   */
-  hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
-
-  /* prevent division by zero */
-  if (hist->samples == 0)
-    hist->samples = 1;
-
-  hist->pts = calloc(hist->samples, sizeof(*hist->pts));
-  hist->sz = calloc(hist->samples, sizeof(*hist->sz));
-  for (i = 0; i < RATE_BINS; i++) {
-    hist->bucket[i].low = INT_MAX;
-    hist->bucket[i].high = 0;
-    hist->bucket[i].count = 0;
-  }
-}
-
-
-static void destroy_rate_histogram(struct rate_hist *hist) {
-  free(hist->pts);
-  free(hist->sz);
-}
-
-
-static void update_rate_histogram(struct rate_hist          *hist,
-                                  const vpx_codec_enc_cfg_t *cfg,
-                                  const vpx_codec_cx_pkt_t  *pkt) {
-  int i, idx;
-  int64_t now, then, sum_sz = 0, avg_bitrate;
-
-  now = pkt->data.frame.pts * 1000
-        * (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
-
-  idx = hist->frames++ % hist->samples;
-  hist->pts[idx] = now;
-  hist->sz[idx] = (int)pkt->data.frame.sz;
-
-  if (now < cfg->rc_buf_initial_sz)
-    return;
-
-  then = now;
-
-  /* Sum the size over the past rc_buf_sz ms */
-  for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) {
-    int i_idx = (i - 1) % hist->samples;
-
-    then = hist->pts[i_idx];
-    if (now - then > cfg->rc_buf_sz)
-      break;
-    sum_sz += hist->sz[i_idx];
-  }
-
-  if (now == then)
-    return;
-
-  avg_bitrate = sum_sz * 8 * 1000 / (now - then);
-  idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000));
-  if (idx < 0)
-    idx = 0;
-  if (idx > RATE_BINS - 1)
-    idx = RATE_BINS - 1;
-  if (hist->bucket[idx].low > avg_bitrate)
-    hist->bucket[idx].low = (int)avg_bitrate;
-  if (hist->bucket[idx].high < avg_bitrate)
-    hist->bucket[idx].high = (int)avg_bitrate;
-  hist->bucket[idx].count++;
-  hist->total++;
-}
-
-
-static void show_rate_histogram(struct rate_hist          *hist,
-                                const vpx_codec_enc_cfg_t *cfg,
-                                int                        max_buckets) {
-  int i, scale;
-  int buckets = 0;
-
-  for (i = 0; i < RATE_BINS; i++) {
-    if (hist->bucket[i].low == INT_MAX)
-      continue;
-    hist->bucket[buckets++] = hist->bucket[i];
-  }
-
-  fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz);
-  scale = merge_hist_buckets(hist->bucket, &buckets, max_buckets);
-  show_histogram(hist->bucket, buckets, hist->total, scale);
-}
-
 #define mmin(a, b)  ((a) < (b) ? (a) : (b))
 static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2,
                           int yloc[4], int uloc[4], int vloc[4]) {
@@ -884,7 +617,7 @@
   struct stream_state      *next;
   struct stream_config      config;
   FILE                     *file;
-  struct rate_hist          rate_hist;
+  struct rate_hist         *rate_hist;
   struct EbmlGlobal         ebml;
   uint32_t                  hash;
   uint64_t                  psnr_sse_total;
@@ -1576,7 +1309,7 @@
         if (!global->quiet)
           fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);
 
-        update_rate_histogram(&stream->rate_hist, cfg, pkt);
+        update_rate_histogram(stream->rate_hist, cfg, pkt);
         if (stream->config.write_webm) {
           /* Update the hash */
           if (!stream->ebml.debug)
@@ -1881,9 +1614,9 @@
                       : VPX_IMG_FMT_YV12,
                       input.width, input.height, 32);
 
-      FOREACH_STREAM(init_rate_histogram(&stream->rate_hist,
-                                         &stream->config.cfg,
-                                         &global.framerate));
+      FOREACH_STREAM(stream->rate_hist =
+                         init_rate_histogram(&stream->config.cfg,
+                                             &global.framerate));
     }
 
     FOREACH_STREAM(setup_pass(stream, &global, pass));
@@ -2020,10 +1753,10 @@
                                     global.show_q_hist_buckets));
 
   if (global.show_rate_hist_buckets)
-    FOREACH_STREAM(show_rate_histogram(&stream->rate_hist,
+    FOREACH_STREAM(show_rate_histogram(stream->rate_hist,
                                        &stream->config.cfg,
                                        global.show_rate_hist_buckets));
-  FOREACH_STREAM(destroy_rate_histogram(&stream->rate_hist));
+  FOREACH_STREAM(destroy_rate_histogram(stream->rate_hist));
 
 #if CONFIG_INTERNAL_STATS
   /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now,