Merge "configure: use -Werror when testing CXX flags w/clang"
diff --git a/.gitignore b/.gitignore
index a93b6a0..dfeae99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,18 +19,14 @@
 /config.log
 /config.mk
 /decode_to_md5
-/decode_to_md5.c
 /decode_to_md5.dox
 /decode_with_drops
-/decode_with_drops.c
 /decode_with_drops.dox
 /docs/
 /doxyfile
 /error_resilient
-/error_resilient.c
 /error_resilient.dox
 /force_keyframe
-/force_keyframe.c
 /force_keyframe.dox
 /ivfdec
 /ivfdec.dox
@@ -40,27 +36,21 @@
 /libvpx.ver
 /obj_int_extract
 /postproc
-/postproc.c
 /postproc.dox
 /samples.dox
 /simple_decoder
-/simple_decoder.c
 /simple_decoder.dox
 /simple_encoder
-/simple_encoder.c
 /simple_encoder.dox
 /test_libvpx
 /twopass_encoder
-/twopass_encoder.c
 /twopass_encoder.dox
 /vp8_api1_migration.dox
 /vp8_scalable_patterns
 /vp8_scalable_patterns.dox
 /vp8_set_maps
-/vp8_set_maps.c
 /vp8_set_maps.dox
 /vp8cx_set_ref
-/vp8cx_set_ref.c
 /vp8cx_set_ref.dox
 /vpx.pc
 /vpx_config.c
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index 991d3c2..6ebea9f 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -85,8 +85,8 @@
   );
 }
 
-void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                      int dest_stride);
+void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride);
 
 void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
index 1b2f550..19c582f 100644
--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -19,8 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
-                                 uint32_t no_rows) {
+static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                              uint32_t no_rows) {
   int i;
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   int step1_10, step1_11, step1_12, step1_13;
@@ -404,8 +404,8 @@
   }
 }
 
-static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                         int dest_stride) {
+static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                      int dest_stride) {
   int i;
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   int step1_8, step1_9, step1_10, step1_11;
@@ -905,13 +905,13 @@
   );
 
   // First transform rows
-  idct16_1d_rows_dspr2(input, out, 16);
+  idct16_rows_dspr2(input, out, 16);
 
   // Then transform columns and add to dest
-  idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
-static void iadst16_1d(const int16_t *input, int16_t *output) {
+static void iadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -1099,16 +1099,16 @@
 
   switch (tx_type) {
     case DCT_DCT:     // DCT in both horizontal and vertical
-      idct16_1d_rows_dspr2(input, outptr, 16);
-      idct16_1d_cols_add_blk_dspr2(out, dest, pitch);
+      idct16_rows_dspr2(input, outptr, 16);
+      idct16_cols_add_blk_dspr2(out, dest, pitch);
       break;
     case ADST_DCT:    // ADST in vertical, DCT in horizontal
-      idct16_1d_rows_dspr2(input, outptr, 16);
+      idct16_rows_dspr2(input, outptr, 16);
 
       outptr = out;
 
       for (i = 0; i < 16; ++i) {
-        iadst16_1d(outptr, temp_out);
+        iadst16(outptr, temp_out);
 
         for (j = 0; j < 16; ++j)
           dest[j * pitch + i] =
@@ -1125,7 +1125,7 @@
         /* prefetch row */
         vp9_prefetch_load((const uint8_t *)(input + 16));
 
-        iadst16_1d(input, outptr);
+        iadst16(input, outptr);
         input += 16;
         outptr += 16;
       }
@@ -1134,7 +1134,7 @@
         for (j = 0; j < 16; ++j)
             temp_in[j * 16 + i] = out[i * 16 + j];
 
-      idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);
+      idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
     }
     break;
     case ADST_ADST:   // ADST in both directions
@@ -1145,7 +1145,7 @@
         /* prefetch row */
         vp9_prefetch_load((const uint8_t *)(input + 16));
 
-        iadst16_1d(input, outptr);
+        iadst16(input, outptr);
         input += 16;
         outptr += 16;
       }
@@ -1153,7 +1153,7 @@
       for (i = 0; i < 16; ++i) {
         for (j = 0; j < 16; ++j)
           temp_in[j] = out[j * 16 + i];
-        iadst16_1d(temp_in, temp_out);
+        iadst16(temp_in, temp_out);
         for (j = 0; j < 16; ++j)
           dest[j * pitch + i] =
                     clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
@@ -1183,7 +1183,7 @@
 
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  idct16_1d_rows_dspr2(input, outptr, 4);
+  idct16_rows_dspr2(input, outptr, 4);
 
   outptr += 4;
   for (i = 0; i < 6; ++i) {
@@ -1213,7 +1213,7 @@
   }
 
   // Then transform columns
-  idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
index 5e92db3..132d88c 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -18,8 +18,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                      int dest_stride) {
+void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride) {
   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
   int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index bc67594..74a90b0 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -19,8 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output,
-                                 uint32_t no_rows) {
+static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
+                              uint32_t no_rows) {
   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
   int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
@@ -882,10 +882,10 @@
   );
 
   // Rows
-  idct32_1d_rows_dspr2(input, outptr, 32);
+  idct32_rows_dspr2(input, outptr, 32);
 
   // Columns
-  vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+  vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
 void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
@@ -903,7 +903,7 @@
   );
 
   // Rows
-  idct32_1d_rows_dspr2(input, outptr, 8);
+  idct32_rows_dspr2(input, outptr, 8);
 
   outptr += 8;
   __asm__ __volatile__ (
@@ -947,7 +947,7 @@
   }
 
   // Columns
-  vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride);
+  vp9_idct32_cols_add_blk_dspr2(out, dest, stride);
 }
 
 void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 5b7aa5e..1990348 100644
--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -19,7 +19,7 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
   int16_t   step_0, step_1, step_2, step_3;
   int       Temp0, Temp1, Temp2, Temp3;
   const int const_2_power_13 = 8192;
@@ -104,7 +104,7 @@
   }
 }
 
-static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                                int dest_stride) {
   int16_t   step_0, step_1, step_2, step_3;
   int       Temp0, Temp1, Temp2, Temp3;
@@ -240,10 +240,10 @@
   );
 
   // Rows
-  vp9_idct4_1d_rows_dspr2(input, outptr);
+  vp9_idct4_rows_dspr2(input, outptr);
 
   // Columns
-  vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
 }
 
 void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
@@ -319,7 +319,7 @@
   }
 }
 
-static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+static void iadst4_dspr2(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
   int x0, x1, x2, x3;
 
@@ -379,16 +379,16 @@
 
   switch (tx_type) {
     case DCT_DCT:   // DCT in both horizontal and vertical
-      vp9_idct4_1d_rows_dspr2(input, outptr);
-      vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      vp9_idct4_rows_dspr2(input, outptr);
+      vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
       break;
     case ADST_DCT:  // ADST in vertical, DCT in horizontal
-      vp9_idct4_1d_rows_dspr2(input, outptr);
+      vp9_idct4_rows_dspr2(input, outptr);
 
       outptr = out;
 
       for (i = 0; i < 4; ++i) {
-        iadst4_1d_dspr2(outptr, temp_out);
+        iadst4_dspr2(outptr, temp_out);
 
         for (j = 0; j < 4; ++j)
           dest[j * dest_stride + i] =
@@ -400,7 +400,7 @@
       break;
     case DCT_ADST:  // DCT in vertical, ADST in horizontal
       for (i = 0; i < 4; ++i) {
-        iadst4_1d_dspr2(input, outptr);
+        iadst4_dspr2(input, outptr);
         input  += 4;
         outptr += 4;
       }
@@ -410,11 +410,11 @@
           temp_in[i * 4 + j] = out[j * 4 + i];
         }
       }
-      vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
       break;
     case ADST_ADST:  // ADST in both directions
       for (i = 0; i < 4; ++i) {
-        iadst4_1d_dspr2(input, outptr);
+        iadst4_dspr2(input, outptr);
         input  += 4;
         outptr += 4;
       }
@@ -422,7 +422,7 @@
       for (i = 0; i < 4; ++i) {
         for (j = 0; j < 4; ++j)
           temp_in[j] = out[j * 4 + i];
-        iadst4_1d_dspr2(temp_in, temp_out);
+        iadst4_dspr2(temp_in, temp_out);
 
         for (j = 0; j < 4; ++j)
           dest[j * dest_stride + i] =
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index 93a0840..acccaea 100644
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -19,8 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
-                                uint32_t no_rows) {
+static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
+                             uint32_t no_rows) {
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   const int const_2_power_13 = 8192;
   int Temp0, Temp1, Temp2, Temp3, Temp4;
@@ -200,8 +200,8 @@
   }
 }
 
-static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                           int dest_stride) {
+static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                        int dest_stride) {
   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
   int Temp0, Temp1, Temp2, Temp3;
   int i;
@@ -462,13 +462,13 @@
   );
 
   // First transform rows
-  idct8_1d_rows_dspr2(input, outptr, 8);
+  idct8_rows_dspr2(input, outptr, 8);
 
   // Then transform columns and add to dest
-  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
 }
 
-static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+static void iadst8_dspr2(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
   int x0, x1, x2, x3, x4, x5, x6, x7;
 
@@ -563,14 +563,14 @@
 
   switch (tx_type) {
     case DCT_DCT:     // DCT in both horizontal and vertical
-      idct8_1d_rows_dspr2(input, outptr, 8);
-      idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      idct8_rows_dspr2(input, outptr, 8);
+      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
       break;
     case ADST_DCT:    // ADST in vertical, DCT in horizontal
-      idct8_1d_rows_dspr2(input, outptr, 8);
+      idct8_rows_dspr2(input, outptr, 8);
 
       for (i = 0; i < 8; ++i) {
-        iadst8_1d_dspr2(&out[i * 8], temp_out);
+        iadst8_dspr2(&out[i * 8], temp_out);
 
         for (j = 0; j < 8; ++j)
           dest[j * dest_stride + i] =
@@ -580,7 +580,7 @@
       break;
     case DCT_ADST:    // DCT in vertical, ADST in horizontal
       for (i = 0; i < 8; ++i) {
-        iadst8_1d_dspr2(input, outptr);
+        iadst8_dspr2(input, outptr);
         input += 8;
         outptr += 8;
       }
@@ -590,11 +590,11 @@
           temp_in[i * 8 + j] = out[j * 8 + i];
         }
       }
-      idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
       break;
     case ADST_ADST:   // ADST in both directions
       for (i = 0; i < 8; ++i) {
-        iadst8_1d_dspr2(input, outptr);
+        iadst8_dspr2(input, outptr);
         input += 8;
         outptr += 8;
       }
@@ -603,7 +603,7 @@
         for (j = 0; j < 8; ++j)
           temp_in[j] = out[j * 8 + i];
 
-        iadst8_1d_dspr2(temp_in, temp_out);
+        iadst8_dspr2(temp_in, temp_out);
 
         for (j = 0; j < 8; ++j)
           dest[j * dest_stride + i] =
@@ -631,7 +631,7 @@
   );
 
   // First transform rows
-  idct8_1d_rows_dspr2(input, outptr, 4);
+  idct8_rows_dspr2(input, outptr, 4);
 
   outptr += 4;
 
@@ -659,7 +659,7 @@
 
 
   // Then transform columns and add to dest
-  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
 }
 
 void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index d5c27a5..487f00c 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -218,27 +218,25 @@
     } else {  // inter/inter
       const int above_has_second = has_second_ref(above_mbmi);
       const int left_has_second = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
       if (above_has_second && left_has_second) {
-        pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                            above_mbmi->ref_frame[1] == LAST_FRAME ||
-                            left_mbmi->ref_frame[0] == LAST_FRAME ||
-                            left_mbmi->ref_frame[1] == LAST_FRAME);
+        pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+                            left0 == LAST_FRAME || left1 == LAST_FRAME);
       } else if (above_has_second || left_has_second) {
-        const MV_REFERENCE_FRAME rfs = !above_has_second ?
-            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME crf1 = above_has_second ?
-            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-        const MV_REFERENCE_FRAME crf2 = above_has_second ?
-            above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
 
         if (rfs == LAST_FRAME)
           pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
         else
           pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
       } else {
-        pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
-                       2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
+        pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
       }
     }
   } else if (has_above || has_left) {  // one edge available
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 8ab5fb1..feb0098 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,8 +23,8 @@
   const short *filter
 );
 
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \
-void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                    uint8_t *dst, ptrdiff_t dst_stride, \
                                    const int16_t *filter_x, int x_step_q4, \
                                    const int16_t *filter_y, int y_step_q4, \
@@ -32,7 +32,7 @@
   if (step_q4 == 16 && filter[3] != 128) { \
     if (filter[0] || filter[1] || filter[2]) { \
       while (w >= 16) { \
-        vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \
+        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
                                                   dst, dst_stride, \
                                                   h, filter); \
         src += 16; \
@@ -40,7 +40,7 @@
         w -= 16; \
       } \
       while (w >= 8) { \
-        vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \
+        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
         src += 8; \
@@ -48,7 +48,7 @@
         w -= 8; \
       } \
       while (w >= 4) { \
-        vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \
+        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
         src += 4; \
@@ -57,7 +57,7 @@
       } \
     } else { \
       while (w >= 16) { \
-        vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \
+        vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \
                                                   dst, dst_stride, \
                                                   h, filter); \
         src += 16; \
@@ -65,7 +65,7 @@
         w -= 16; \
       } \
       while (w >= 8) { \
-        vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \
+        vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
         src += 8; \
@@ -73,7 +73,7 @@
         w -= 8; \
       } \
       while (w >= 4) { \
-        vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \
+        vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \
                                                  dst, dst_stride, \
                                                  h, filter); \
         src += 4; \
@@ -136,18 +136,18 @@
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
 
-filter8_1dfunction vp9_filter_block1d16_v2_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
 
 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -169,11 +169,11 @@
 //                                   const int16_t *filter_x, int x_step_q4,
 //                                   const int16_t *filter_y, int y_step_q4,
 //                                   int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3, sse2);
+            ssse3);
 
 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
@@ -236,11 +236,10 @@
 //                                  const int16_t *filter_x, int x_step_q4,
 //                                  const int16_t *filter_y, int y_step_q4,
 //                                  int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2,
-            sse2);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
 
 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                         uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000..b5e18fe
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,422 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0                        ;argument/constant setup for the 4-pixel-wide kernels; writes xmm2 and xmm3
+    mov         rdx, arg(5)                 ;filter ptr (rdx is reloaded with out_pitch below)
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;0x0040 (64) in each of the two low 16-bit words
+
+    movdqa      xmm3, [rdx]                 ;load filters (aligned load: [rdx] must be 16-byte aligned)
+    psrldq      xmm3, 6                     ;byte shift: filter bytes 6..9 move down to bytes 0..3
+    packsswb    xmm3, xmm3                  ;words -> signed bytes with saturation; bytes 0,1 now hold k3,k4
+    pshuflw     xmm3, xmm3, 0b              ;k3_k4 pair copied into each of the low four words
+
+    movq        xmm2, rcx                   ;rounding
+    pshufd      xmm2, xmm2, 0               ;broadcast the low dword: every word of xmm2 = 64
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height (row counter decremented by APPLY_FILTER_4)
+%endm
+
+%macro APPLY_FILTER_4 1                     ;one output row of 4 pixels; %1 != 0 selects the averaging variant
+    punpcklbw   xmm0, xmm1                  ;interleave low bytes into (xmm0[i], xmm1[i]) pairs
+    pmaddubsw   xmm0, xmm3                  ;unsigned pixel bytes * signed tap bytes, each pair summed into a word
+
+    paddsw      xmm0, xmm2                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]                 ;avg variant: fetch the 4 bytes already at the output
+    pavgb       xmm0, xmm1                  ;rounded byte average with them
+%endif
+    movd        [rdi], xmm0                 ;store 4 result bytes
+    lea         rsi, [rsi + rax]            ;advance the source pointer by rax
+    lea         rdi, [rdi + rdx]            ;advance the output pointer by rdx
+    dec         rcx                         ;count one row done; the ZF set here is tested by the caller's jnz
+%endm
+
+%macro GET_PARAM 0                          ;argument/constant setup for the 8- and 16-pixel-wide kernels; writes xmm6 and xmm7
+    mov         rdx, arg(5)                 ;filter ptr (rdx is reloaded with out_pitch below)
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;0x0040 (64) in each of the two low 16-bit words
+
+    movdqa      xmm7, [rdx]                 ;load filters (aligned load: [rdx] must be 16-byte aligned)
+    psrldq      xmm7, 6                     ;byte shift: filter bytes 6..9 move down to bytes 0..3
+    packsswb    xmm7, xmm7                  ;words -> signed bytes with saturation; bytes 0,1 now hold k3,k4
+    pshuflw     xmm7, xmm7, 0b              ;k3_k4
+    punpcklwd   xmm7, xmm7                  ;low four words interleaved with themselves: all eight words = k3_k4
+
+    movq        xmm6, rcx                   ;rounding
+    pshufd      xmm6, xmm6, 0               ;broadcast the low dword: every word of xmm6 = 64
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height (row counter decremented by APPLY_FILTER_8/16)
+%endm
+
+%macro APPLY_FILTER_8 1                     ;one output row of 8 pixels; %1 != 0 selects the averaging variant
+    punpcklbw   xmm0, xmm1                  ;interleave low 8 bytes into (xmm0[i], xmm1[i]) pairs
+    pmaddubsw   xmm0, xmm7                  ;unsigned pixel bytes * signed tap bytes, each pair summed into a word
+
+    paddsw      xmm0, xmm6                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+
+%if %1
+    movq        xmm1, [rdi]                 ;avg variant: fetch the 8 bytes already at the output
+    pavgb       xmm0, xmm1                  ;rounded byte average with them
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]            ;advance the source pointer by rax
+    lea         rdi, [rdi + rdx]            ;advance the output pointer by rdx
+    dec         rcx                         ;count one row done; the ZF set here is tested by the caller's jnz
+%endm
+
+%macro APPLY_FILTER_16 1                    ;one output row of 16 pixels; expects xmm2 to be a copy of xmm0; %1 != 0 averages
+    punpcklbw   xmm0, xmm1                  ;low 8 bytes of xmm0 paired with low 8 bytes of xmm1
+    punpckhbw   xmm2, xmm1                  ;high 8 bytes of xmm2 paired with high 8 bytes of xmm1
+    pmaddubsw   xmm0, xmm7                  ;unsigned pixel bytes * signed tap bytes, each pair summed into a word
+    pmaddubsw   xmm2, xmm7                  ;same for the upper eight pixels
+
+    paddsw      xmm0, xmm6                  ;rounding
+    paddsw      xmm2, xmm6
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+
+%if %1
+    movdqu      xmm1, [rdi]                 ;avg variant: fetch the 16 bytes already at the output
+    pavgb       xmm0, xmm1                  ;rounded byte average with them
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]            ;advance the source pointer by rax
+    lea         rdi, [rdi + rdx]            ;advance the output pointer by rdx
+    dec         rcx                         ;count one row done; the ZF set here is tested by the caller's jnz
+%endm
+
+global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_ssse3):          ;4 bytes per row: each src byte is weighted with the byte pixels_per_line further on (taps k3,k4)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4                             ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, pixels_per_line bytes further on
+
+    APPLY_FILTER_4 0                        ;filter and store without averaging, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_4; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_ssse3):          ;8 bytes per row: each src byte is weighted with the byte pixels_per_line further on (taps k3,k4)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movq        xmm0, [rsi]                 ;row 0: 8 bytes at the current source position
+    movq        xmm1, [rsi + rax]           ;row 1: same columns, pixels_per_line bytes further on
+
+    APPLY_FILTER_8 0                        ;filter and store without averaging, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_8; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_ssse3):         ;16 bytes per row: each src byte is weighted with the byte pixels_per_line further on (taps k3,k4)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu        xmm0, [rsi]               ;row 0: 16 bytes at the current source position
+    movdqu        xmm1, [rsi + rax]         ;row 1: same columns, pixels_per_line bytes further on
+    movdqa        xmm2, xmm0                ;copy of row 0 for the high-half unpack in APPLY_FILTER_16
+
+    APPLY_FILTER_16 0                       ;filter and store without averaging, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_16; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v2_avg_ssse3):      ;as the 4-wide v2 kernel, but the result is averaged with the bytes already at the output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4                             ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, pixels_per_line bytes further on
+
+    APPLY_FILTER_4 1                        ;filter, pavgb with existing output, store, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_4; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v2_avg_ssse3):      ;as the 8-wide v2 kernel, but the result is averaged with the bytes already at the output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movq        xmm0, [rsi]                 ;row 0: 8 bytes at the current source position
+    movq        xmm1, [rsi + rax]           ;row 1: same columns, pixels_per_line bytes further on
+
+    APPLY_FILTER_8 1                        ;filter, pavgb with existing output, store, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_8; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v2_avg_ssse3):     ;as the 16-wide v2 kernel, but the result is averaged with the bytes already at the output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu        xmm0, [rsi]               ;row 0: 16 bytes at the current source position
+    movdqu        xmm1, [rsi + rax]         ;row 1: same columns, pixels_per_line bytes further on
+    movdqa        xmm2, xmm0                ;copy of row 0 for the high-half unpack in APPLY_FILTER_16
+
+    APPLY_FILTER_16 1                       ;filter, pavgb with existing output, store, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_16; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_ssse3):          ;4 bytes per row: each src byte is weighted with its right-hand neighbour (taps k3,k4)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4                             ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16-byte unaligned load; only the first 5 bytes reach the 4 stored results)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;xmm1[i] = src[i + 1]
+
+    APPLY_FILTER_4 0                        ;filter and store without averaging, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_4; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_ssse3):          ;8 bytes per row: each src byte is weighted with its right-hand neighbour (taps k3,k4)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16-byte unaligned load; only the first 9 bytes reach the 8 stored results)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;xmm1[i] = src[i + 1]
+
+    APPLY_FILTER_8 0                        ;filter and store without averaging, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_8; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_ssse3):         ;16 bytes per row: each src byte is weighted with its right-hand neighbour (taps k3,k4)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]           ;same row shifted by one byte (17 source bytes are read in total)
+    movdqa      xmm2, xmm0                  ;copy of src for the high-half unpack in APPLY_FILTER_16
+
+    APPLY_FILTER_16 0                       ;filter and store without averaging, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_16; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h2_avg_ssse3):      ;as the 4-wide h2 kernel, but the result is averaged with the bytes already at the output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4                             ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16-byte unaligned load; only the first 5 bytes reach the 4 stored results)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;xmm1[i] = src[i + 1]
+
+    APPLY_FILTER_4 1                        ;filter, pavgb with existing output, store, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_4; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h2_avg_ssse3):      ;as the 8-wide h2 kernel, but the result is averaged with the bytes already at the output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16-byte unaligned load; only the first 9 bytes reach the 8 stored results)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;xmm1[i] = src[i + 1]
+
+    APPLY_FILTER_8 1                        ;filter, pavgb with existing output, store, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_8; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h2_avg_ssse3):     ;as the 16-wide h2 kernel, but the result is averaged with the bytes already at the output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6                  ;presumably 6 = argument count (arg(0)..arg(5) are read); macro is not defined in this file
+    SAVE_XMM 7                              ;GET_PARAM writes xmm6/xmm7; presumably that is why they are saved (macro defined outside this file)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM                               ;rsi=src, rdi=output, rax=pixels_per_line, rdx=out_pitch, rcx=output_height
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]           ;same row shifted by one byte (17 source bytes are read in total)
+    movdqa      xmm2, xmm0                  ;copy of src for the high-half unpack in APPLY_FILTER_16
+
+    APPLY_FILTER_16 1                       ;filter, pavgb with existing output, store, step pointers, dec rcx
+    jnz         .loop                       ;ZF from the dec rcx ending APPLY_FILTER_16; body runs before the test, so output_height must be > 0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index eaedf1e..e895d0a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -491,24 +491,26 @@
   } else {
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (is_inter_block(mbmi) &&
-        (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
-      int_mv best_mv[2];
-      for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-        best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-      vp9_update_mv_count(cpi, x, best_mv);
-    }
 
-    if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) {
-      const int ctx = vp9_get_pred_context_switchable_interp(xd);
-      ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+    if (is_inter_block(mbmi)) {
+      if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
+        int_mv best_mv[2];
+        for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
+          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+        vp9_update_mv_count(cpi, x, best_mv);
+      }
+
+      if (cm->interp_filter == SWITCHABLE) {
+        const int ctx = vp9_get_pred_context_switchable_interp(xd);
+        ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+      }
     }
 
     cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
     cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
     cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
       cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
   }
 }
@@ -1072,17 +1074,18 @@
   } else {
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (is_inter_block(mbmi) &&
-        (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
-      int_mv best_mv[2];
-      for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-        best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-      vp9_update_mv_count(cpi, x, best_mv);
-    }
+    if (is_inter_block(mbmi)) {
+      if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
+        int_mv best_mv[2];
+        for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
+          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+        vp9_update_mv_count(cpi, x, best_mv);
+      }
 
-    if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) {
-      const int ctx = vp9_get_pred_context_switchable_interp(xd);
-      ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+      if (cm->interp_filter == SWITCHABLE) {
+        const int ctx = vp9_get_pred_context_switchable_interp(xd);
+        ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+      }
     }
   }
 }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index dcebefb..838f74e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -133,9 +133,9 @@
   return 1;
 }
 
-static void output_stats(const VP9_COMP            *cpi,
+static void output_stats(const VP9_COMP *cpi,
                          struct vpx_codec_pkt_list *pktlist,
-                         FIRSTPASS_STATS            *stats) {
+                         FIRSTPASS_STATS *stats) {
   struct vpx_codec_cx_pkt pkt;
   pkt.kind = VPX_CODEC_STATS_PKT;
   pkt.data.twopass_stats.buf = stats;
@@ -266,9 +266,9 @@
 
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
-static double calculate_modified_err(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *this_frame) {
-  struct twopass_rc *const twopass = &cpi->twopass;
+static double calculate_modified_err(const VP9_COMP *cpi,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const struct twopass_rc *const twopass = &cpi->twopass;
   const FIRSTPASS_STATS *const stats = &twopass->total_stats;
   const double av_err = stats->ssim_weighted_pred_err / stats->count;
   double modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
@@ -337,7 +337,7 @@
 }
 
 // This function returns the maximum target rate per frame.
-static int frame_max_bits(VP9_COMP *cpi) {
+static int frame_max_bits(const VP9_COMP *cpi) {
   int64_t max_bits =
     ((int64_t)cpi->rc.av_per_frame_bandwidth *
      (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
@@ -469,7 +469,7 @@
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  PICK_MODE_CONTEXT *ctx = &x->sb64_context;
+  const PICK_MODE_CONTEXT *ctx = &x->sb64_context;
   int i;
 
   int recon_yoffset, recon_uvoffset;
@@ -907,7 +907,7 @@
   int q;
   const int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
-  RATE_CONTROL *const rc = &cpi->rc;
+  const RATE_CONTROL *const rc = &cpi->rc;
 
   const double section_err = fpstats->coded_error / fpstats->count;
   const double err_per_mb = section_err / num_mbs;
@@ -1040,12 +1040,10 @@
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(
-  VP9_COMP *cpi,
-  int frame_interval,
-  int still_interval,
-  double loop_decay_rate,
-  double last_decay_rate) {
+static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval,
+                                      int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
   int trans_to_still = 0;
 
   // Break clause to detect very still sections after motion
@@ -1413,7 +1411,7 @@
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   double mv_ratio_accumulator_thresh;
-  int max_bits = frame_max_bits(cpi);     // Max for a single frame
+  const int max_bits = frame_max_bits(cpi);     // Max for a single frame
 
   unsigned int allow_alt_ref = cpi->oxcf.play_alternate &&
                                cpi->oxcf.lag_in_frames;
@@ -1638,27 +1636,23 @@
 #endif
 
   // Calculate the bits to be allocated to the group as a whole
-  if ((cpi->twopass.kf_group_bits > 0) &&
-      (cpi->twopass.kf_group_error_left > 0)) {
-    cpi->twopass.gf_group_bits =
-      (int64_t)(cpi->twopass.kf_group_bits *
+  if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) {
+    twopass->gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits *
                 (gf_group_err / cpi->twopass.kf_group_error_left));
   } else {
-    cpi->twopass.gf_group_bits = 0;
+    twopass->gf_group_bits = 0;
   }
-  cpi->twopass.gf_group_bits =
-    (cpi->twopass.gf_group_bits < 0)
-    ? 0
-    : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-    ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
+  twopass->gf_group_bits = (twopass->gf_group_bits < 0) ?
+     0 : (twopass->gf_group_bits > twopass->kf_group_bits) ?
+     twopass->kf_group_bits : twopass->gf_group_bits;
 
   // Clip cpi->twopass.gf_group_bits based on user supplied data rate
   // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
-    cpi->twopass.gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+    twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
 
   // Reset the file position
-  reset_fpf_position(&cpi->twopass, start_pos);
+  reset_fpf_position(twopass, start_pos);
 
   // Assign  bits to the arf or gf.
   for (i = 0; i <= (rc->source_alt_ref_pending &&
@@ -1686,17 +1680,17 @@
 
     // Calculate the number of bits to be spent on the gf or arf based on
     // the boost number
-    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
-                                       (double)allocation_chunks));
+    gf_bits = (int)((double)boost * (twopass->gf_group_bits /
+                  (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
     if (rc->baseline_gf_interval < 1 ||
         mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
-      double alt_gf_grp_bits = (double)cpi->twopass.kf_group_bits  *
+      double alt_gf_grp_bits = (double)twopass->kf_group_bits  *
         (mod_frame_err * (double)rc->baseline_gf_interval) /
-        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
+        DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left);
 
       int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
                                            (double)allocation_chunks));
@@ -1707,9 +1701,9 @@
       // If it is harder than other frames in the group make sure it at
       // least receives an allocation in keeping with its relative error
       // score, otherwise it may be worse off than an "un-boosted" frame.
-      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+      int alt_gf_bits = (int)((double)twopass->kf_group_bits *
                         mod_frame_err /
-                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+                        DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left));
 
       if (alt_gf_bits > gf_bits)
         gf_bits = alt_gf_bits;
@@ -1720,11 +1714,11 @@
       gf_bits = 0;
 
     if (i == 0) {
-      cpi->twopass.gf_bits = gf_bits;
+      twopass->gf_bits = gf_bits;
     }
     if (i == 1 ||
         (!rc->source_alt_ref_pending &&
-         (cpi->common.frame_type != KEY_FRAME))) {
+         cpi->common.frame_type != KEY_FRAME)) {
       // Per frame bit target for this frame
       vp9_rc_set_frame_target(cpi, gf_bits);
     }
@@ -1732,11 +1726,11 @@
 
   {
     // Adjust KF group bits and error remaining
-    cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
-    cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+    twopass->kf_group_error_left -= (int64_t)gf_group_err;
+    twopass->kf_group_bits -= twopass->gf_group_bits;
 
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
+    if (twopass->kf_group_bits < 0)
+      twopass->kf_group_bits = 0;
 
     // If this is an arf update we want to remove the score for the
     // overlay frame at the end which will usually be very cheap to code.
@@ -1745,18 +1739,18 @@
     // For normal GFs remove the score for the GF itself unless this is
     // also a key frame in which case it has already been accounted for.
     if (rc->source_alt_ref_pending) {
-      cpi->twopass.gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
+      twopass->gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
     } else if (cpi->common.frame_type != KEY_FRAME) {
-      cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err
+      twopass->gf_group_error_left = (int64_t)(gf_group_err
                                                    - gf_first_frame_err);
     } else {
-      cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
+      twopass->gf_group_error_left = (int64_t)gf_group_err;
     }
 
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits;
+    twopass->gf_group_bits -= twopass->gf_bits;
 
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
+    if (twopass->gf_group_bits < 0)
+      twopass->gf_group_bits = 0;
 
     // This condition could fail if there are two kfs very close together
     // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
@@ -1765,12 +1759,10 @@
       const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
 
       if (boost >= 150) {
-        int alt_extra_bits;
-        int pct_extra = (boost - 100) / 50;
-        pct_extra = (pct_extra > 20) ? 20 : pct_extra;
-
-        alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
-        cpi->twopass.gf_group_bits -= alt_extra_bits;
+        const int pct_extra = MIN(20, (boost - 100) / 50);
+        const int alt_extra_bits = (int)((twopass->gf_group_bits * pct_extra) /
+                                       100);
+        twopass->gf_group_bits -= alt_extra_bits;
       }
     }
   }
@@ -1779,20 +1771,20 @@
     FIRSTPASS_STATS sectionstats;
 
     zero_stats(&sectionstats);
-    reset_fpf_position(&cpi->twopass, start_pos);
+    reset_fpf_position(twopass, start_pos);
 
     for (i = 0; i < rc->baseline_gf_interval; i++) {
-      input_stats(&cpi->twopass, &next_frame);
+      input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
 
     avg_stats(&sectionstats);
 
-    cpi->twopass.section_intra_rating = (int)
+    twopass->section_intra_rating = (int)
       (sectionstats.intra_error /
       DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
 
-    reset_fpf_position(&cpi->twopass, start_pos);
+    reset_fpf_position(twopass, start_pos);
   }
 }
 
@@ -1832,9 +1824,9 @@
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
-                             FIRSTPASS_STATS *last_frame,
-                             FIRSTPASS_STATS *this_frame,
-                             FIRSTPASS_STATS *next_frame) {
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame) {
   int is_viable_kf = 0;
 
   // Does the frame satisfy the primary criteria of a key frame
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index fe0a291..b4972be 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -2417,34 +2417,33 @@
 
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
-static int recode_loop_test(VP9_COMP *cpi,
+static int recode_loop_test(const VP9_COMP *cpi,
                             int high_limit, int low_limit,
                             int q, int maxq, int minq) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
   int force_recode = 0;
-  VP9_COMMON *cm = &cpi->common;
 
   // Special case trap if maximum allowed frame size exceeded.
-  if (cpi->rc.projected_frame_size > cpi->rc.max_frame_bandwidth) {
+  if (rc->projected_frame_size > rc->max_frame_bandwidth) {
     force_recode = 1;
 
   // Is frame recode allowed.
   // Yes if either recode mode 1 is selected or mode 2 is selected
   // and the frame is a key frame, golden frame or alt_ref_frame
   } else if ((cpi->sf.recode_loop == 1) ||
-      ((cpi->sf.recode_loop == 2) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cpi->refresh_golden_frame ||
-        cpi->refresh_alt_ref_frame))) {
+             ((cpi->sf.recode_loop == 2) &&
+              (cm->frame_type == KEY_FRAME ||
+               cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
     // General over and under shoot tests
-    if (((cpi->rc.projected_frame_size > high_limit) && (q < maxq)) ||
-        ((cpi->rc.projected_frame_size < low_limit) && (q > minq))) {
+    if ((rc->projected_frame_size > high_limit && q < maxq) ||
+        (rc->projected_frame_size < low_limit && q > minq)) {
       force_recode = 1;
     } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
       // Deal with frame undershoot and whether or not we are
       // below the automatically set cq level.
       if (q > cpi->cq_target_quality &&
-          cpi->rc.projected_frame_size <
-          ((cpi->rc.this_frame_target * 7) >> 3)) {
+          rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
         force_recode = 1;
       }
     }
@@ -2918,14 +2917,14 @@
   int bottom_index;
 
   SPEED_FEATURES *const sf = &cpi->sf;
-  unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
+  unsigned int max_mv_def = MIN(cm->width, cm->height);
   struct segmentation *const seg = &cm->seg;
 
   set_ext_overrides(cpi);
 
   /* Scale the source buffer, if required. */
-  if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
-      cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
+  if (cm->mi_cols * MI_SIZE != cpi->un_scaled_source->y_width ||
+      cm->mi_rows * MI_SIZE != cpi->un_scaled_source->y_height) {
     scale_and_extend_frame_nonnormative(cpi->un_scaled_source,
                                         &cpi->scaled_source);
     cpi->Source = &cpi->scaled_source;
@@ -2934,12 +2933,8 @@
   }
   scale_references(cpi);
 
-  // Clear down mmx registers to allow floating point in what follows.
   vp9_clear_system_state();
 
-  // Clear zbin over-quant value and mode boost values.
-  cpi->zbin_mode_boost = 0;
-
   // Enable or disable mode based tweaking of the zbin.
   // For 2 pass only used where GF/ARF prediction quality
   // is above a threshold.
@@ -2947,7 +2942,7 @@
   cpi->zbin_mode_boost_enabled = 0;
 
   // Current default encoder behavior for the altref sign bias.
-  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active;
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active;
 
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
@@ -2956,7 +2951,7 @@
   cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
   // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
   if (sf->auto_mv_step_size) {
-    if (frame_is_intra_only(&cpi->common)) {
+    if (frame_is_intra_only(cm)) {
       // Initialize max_mv_magnitude for use in the first INTER frame
       // after a key/intra-only frame.
       cpi->max_mv_magnitude = max_mv_def;
@@ -2965,8 +2960,8 @@
         // Allow mv_steps to correspond to twice the max mv magnitude found
         // in the previous frame, capped by the default max_mv_magnitude based
         // on resolution.
-        cpi->mv_step_param = vp9_init_search_range(
-            cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+        cpi->mv_step_param = vp9_init_search_range(cpi, MIN(max_mv_def, 2 *
+                                 cpi->max_mv_magnitude));
       cpi->max_mv_magnitude = 0;
     }
   }
@@ -3003,9 +2998,8 @@
   // static regions if indicated.
   // Only allowed in second pass of two pass (as requires lagged coding)
   // and if the relevant speed feature flag is set.
-  if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
+  if (cpi->pass == 2 && cpi->sf.static_segmentation)
     configure_static_seg_features(cpi);
-  }
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
@@ -3066,7 +3060,7 @@
   // should be larger. Q of 0 is disabled because we force tx size to be
   // 16x16...
   if (cpi->sf.super_fast_rtc) {
-    if (cpi->common.current_video_frame == 0)
+    if (cm->current_video_frame == 0)
       q /= 3;
 
     if (q == 0)
@@ -3079,14 +3073,8 @@
     set_high_precision_mv(cpi, (q < HIGH_PRECISION_MV_QTHRESH));
   }
 
-  encode_with_recode_loop(cpi,
-                          size,
-                          dest,
-                          &q,
-                          bottom_index,
-                          top_index,
-                          frame_over_shoot_limit,
-                          frame_under_shoot_limit);
+  encode_with_recode_loop(cpi, size, dest, &q, bottom_index, top_index,
+                          frame_over_shoot_limit, frame_under_shoot_limit);
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
@@ -3133,18 +3121,15 @@
   update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cpi->common.counts.coef[t],
-                         cpi->coef_counts[t]);
-  if (!cpi->common.error_resilient_mode &&
-      !cpi->common.frame_parallel_decoding_mode) {
-    vp9_adapt_coef_probs(&cpi->common);
-  }
+    full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
 
-  if (!frame_is_intra_only(&cpi->common)) {
-    if (!cpi->common.error_resilient_mode &&
-        !cpi->common.frame_parallel_decoding_mode) {
-      vp9_adapt_mode_probs(&cpi->common);
-      vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv);
+  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
+    vp9_adapt_coef_probs(cm);
+
+  if (!frame_is_intra_only(cm)) {
+    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+      vp9_adapt_mode_probs(cm);
+      vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
     }
   }
 
@@ -3156,14 +3141,14 @@
   output_frame_level_debug_stats(cpi);
 #endif
   if (cpi->refresh_golden_frame == 1)
-    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+    cm->frame_flags |= FRAMEFLAGS_GOLDEN;
   else
-    cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+    cm->frame_flags &= ~FRAMEFLAGS_GOLDEN;
 
   if (cpi->refresh_alt_ref_frame == 1)
-    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+    cm->frame_flags |= FRAMEFLAGS_ALTREF;
   else
-    cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+    cm->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
   get_ref_frame_flags(cpi);
 
@@ -3212,6 +3197,7 @@
   // reset to normal state now that we are done.
   if (!cm->show_existing_frame)
     cm->last_show_frame = cm->show_frame;
+
   if (cm->show_frame) {
     // current mip will be the prev_mip for the next frame
     MODE_INFO *temp = cm->prev_mip;
@@ -3232,6 +3218,7 @@
     // update not a real frame
     ++cm->current_video_frame;
   }
+
   // restore prev_mi
   cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad.c
similarity index 100%
rename from vp9/encoder/vp9_sad_c.c
rename to vp9/encoder/vp9_sad.c
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index c691411..f9ba41b 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -76,6 +76,7 @@
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 131d6c7..c225f54 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -56,7 +56,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
 VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
-VP9_CX_SRCS-yes += encoder/vp9_sad_c.c
+VP9_CX_SRCS-yes += encoder/vp9_sad.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
 VP9_CX_SRCS-yes += encoder/vp9_subexp.c