Merge "all_builds.py: support for sharding builds" into experimental
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index 3527fc1..0bb5c88 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -262,4 +262,12 @@
 
 void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd,
                           MACROBLOCKD *xd);
+
+#if CONFIG_SUPERBLOCKS
+extern void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd,
+                              MACROBLOCKD *xd, uint8_t *dst);
+extern void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd,
+                               MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst);
+#endif
+
 #endif
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 7ad0adb..37e34b5 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -45,6 +45,15 @@
                                                    int dst_ystride,
                                                    int dst_uvstride);
 
+#if CONFIG_SUPERBLOCKS
+extern void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               unsigned char *dst_u,
+                                               unsigned char *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
+#endif
+
 extern void vp8_build_inter_predictors_mb(MACROBLOCKD *xd);
 
 extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index e391fa9..e84afa1 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -218,7 +218,7 @@
   int r, c, i;
 
   for (i = 0; i < bsize; i++) {
-    yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1];
+    yleft_col[i] = src[i * src_stride - 1];
   }
 
   /* for Y */
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 1cb5de3..66029f8 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -14,8 +14,8 @@
 # compiles warning free but a disassembly of generated code shows bugs. To be
 # on the safe side, only enabled when compiled with 'gcc'.
 if [ "$CONFIG_GCC" = "yes" ]; then
-    specialize vp8_filter_block2d_4x4_8 sse4_1
-    specialize vp8_filter_block2d_8x4_8 sse4_1
-    specialize vp8_filter_block2d_8x8_8 sse4_1
-    specialize vp8_filter_block2d_16x16_8 sse4_1
+    specialize vp8_filter_block2d_4x4_8 sse4_1 sse2
+    specialize vp8_filter_block2d_8x4_8 sse4_1 sse2
+    specialize vp8_filter_block2d_8x8_8 sse4_1 sse2
+    specialize vp8_filter_block2d_16x16_8 sse4_1 sse2
 fi
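
For reference, each `specialize` line feeds the RTCD generator, which emits run-time dispatch that starts from the plain C version and upgrades to the best variant the host CPU supports. The generated code behaves roughly like the following sketch (illustrative only; the real dispatch is generated from rtcd_defs.sh and the `HAS_*` flag names here are placeholders):

    /* Function pointer starts at the C fallback and is promoted in the order
       the specializations are listed, so adding sse2 gives CPUs without
       SSE4.1 a vectorized path instead of falling back to plain C. */
    void (*vp8_filter_block2d_4x4_8)(const unsigned char *src_ptr,
                                     const unsigned int src_stride,
                                     const short *HFilter_aligned16,
                                     const short *VFilter_aligned16,
                                     unsigned char *dst_ptr,
                                     unsigned int dst_stride);

    static void setup_rtcd(int flags) {
      vp8_filter_block2d_4x4_8 = vp8_filter_block2d_4x4_8_c;
      if (flags & HAS_SSE2)
        vp8_filter_block2d_4x4_8 = vp8_filter_block2d_4x4_8_sse2;
      if (flags & HAS_SSE4_1)
        vp8_filter_block2d_4x4_8 = vp8_filter_block2d_4x4_8_sse4_1;
    }
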
diff --git a/vp8/common/x86/filter_sse2.c b/vp8/common/x86/filter_sse2.c
new file mode 100644
index 0000000..fe57b4e
--- /dev/null
+++ b/vp8/common/x86/filter_sse2.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <emmintrin.h> // SSE2
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for the non-4x4 sizes. This
+//           is just a quick partial snapshot so that others can already use
+//           some of the speedup.
+// TODO(cd): Use the vectorized 8-tap filtering code as a speedup over the
+//           pure C 6-tap filtering.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no
+//           sum of positives above 128), or have higher precision filter
+//           coefficients.
+
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+  VP8_FILTER_WEIGHT >> 1,
+  VP8_FILTER_WEIGHT >> 1,
+  VP8_FILTER_WEIGHT >> 1,
+  VP8_FILTER_WEIGHT >> 1,
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
+  {                                                                            \
+  /* Do shifted loads to achieve the required shuffles through unpacking */   \
+  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
+  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
+  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
+  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
+  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
+  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
+  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
+  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
+  /* Shift by 4 bytes through shuffle to get additional shifted loads */      \
+  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
+  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
+  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
+  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
+  /* multiply accumulate them */                                               \
+  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
+  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
+  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
+  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
+  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
+  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
+  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
+  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
+  result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT);                          \
+  }
+
+void vp8_filter_block2d_4x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  __m128i intermediateA, intermediateB, intermediateC;
+
+  const int kInterp_Extend = 4;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+
+  // check alignment
+  assert(0 == ((long)HFilter_aligned16)%16);
+  assert(0 == ((long)VFilter_aligned16)%16);
+
+  {
+    __m128i transpose3_0;
+    __m128i transpose3_1;
+    __m128i transpose3_2;
+    __m128i transpose3_3;
+
+    // Horizontal pass (src -> intermediate).
+    {
+      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+      // get first two columns filter coefficients
+      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+      {
+        __m128i mad_all0;
+        __m128i mad_all1;
+        __m128i mad_all2;
+        __m128i mad_all3;
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride*4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride*4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
+      }
+    }
+
+    // Transpose result (intermediate -> transpose3_x)
+    {
+      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
+      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
+      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
+      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
+      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
+      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
+      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
+      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
+      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
+      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
+      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
+      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
+      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
+      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
+      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
+      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
+      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
+      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
+      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
+      transpose3_0 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose2_2),
+                                           _MM_SHUFFLE(1, 0, 1, 0)));
+      transpose3_1 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose2_2),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      transpose3_2 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose2_3),
+                                           _MM_SHUFFLE(1, 0, 1, 0)));
+      transpose3_3 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose2_3),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+    }
+
+    // Vertical pass (transpose3_x -> dst).
+    {
+      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+      // get first two columns filter coefficients
+      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      __m128i col0, col1, col2, col3;
+      DECLARE_ALIGNED(16, unsigned char, temp[32]);
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_0);
+        DO_FOUR_PIXELS(col0, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_1);
+        DO_FOUR_PIXELS(col1, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_2);
+        DO_FOUR_PIXELS(col2, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_3);
+        DO_FOUR_PIXELS(col3, temp, 0);
+      }
+      // transpose
+      {
+        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
+        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
+        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
+        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
+        col0 = _mm_unpacklo_epi64(T0, T1);
+        col1 = _mm_unpackhi_epi64(T0, T1);
+        col2 = _mm_unpacklo_epi64(T2, T3);
+        col3 = _mm_unpackhi_epi64(T2, T3);
+      }
+      // saturate to 8 bit
+      {
+        col0 = _mm_packs_epi32(col0, col0);
+        col0 = _mm_packus_epi16(col0, col0);
+        col1 = _mm_packs_epi32(col1, col1);
+        col1 = _mm_packus_epi16(col1, col1);
+        col2 = _mm_packs_epi32(col2, col2);
+        col2 = _mm_packus_epi16(col2, col2);
+        col3 = _mm_packs_epi32(col3, col3);
+        col3 = _mm_packus_epi16(col3, col3);
+      }
+      // store
+      {
+        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
+        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
+        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
+        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
+      }
+    }
+  }
+}
+
+void vp8_filter_block2d_8x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int j;
+  for (j = 0; j < 8; j += 4) {
+    vp8_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
+                                  HFilter_aligned16, VFilter_aligned16,
+                                  dst_ptr + j, dst_stride);
+  }
+}
+
+void vp8_filter_block2d_8x8_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 8; i += 4) {
+    for (j = 0; j < 8; j += 4) {
+      vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i*dst_stride, dst_stride);
+    }
+  }
+}
+
+void vp8_filter_block2d_16x16_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 16; i += 4) {
+    for (j = 0; j < 16; j += 4) {
+      vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i*dst_stride, dst_stride);
+    }
+  }
+}
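
In scalar terms, vp8_filter_block2d_4x4_8_sse2 above implements a separable 8-tap filter in two clamped passes: a horizontal pass over the 11 source rows the vertical taps need, a byte transpose, then the same four-pixel kernel reused as the vertical pass. A reference sketch of the arithmetic (assuming kInterp_Extend == 4 and VP8_FILTER_WEIGHT == 1 << VP8_FILTER_SHIFT, as in the code above; the helper name is made up):

    static void filter_block2d_4x4_8_ref(const unsigned char *src, int src_stride,
                                         const short *hfilter, const short *vfilter,
                                         unsigned char *dst, int dst_stride) {
      /* 4 output rows need 4 + 7 = 11 source rows for the 8 vertical taps. */
      unsigned char intermediate[11][4];
      int r, c, t;
      src -= 3 * src_stride + 3;  /* step back (kInterp_Extend - 1) both ways */
      for (r = 0; r < 11; r++)
        for (c = 0; c < 4; c++) {
          int sum = VP8_FILTER_WEIGHT >> 1;  /* rounding, as in rounding_c */
          for (t = 0; t < 8; t++)
            sum += src[r * src_stride + c + t] * hfilter[t];
          sum >>= VP8_FILTER_SHIFT;
          intermediate[r][c] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
        }
      for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++) {
          int sum = VP8_FILTER_WEIGHT >> 1;
          for (t = 0; t < 8; t++)
            sum += intermediate[r + t][c] * vfilter[t];
          sum >>= VP8_FILTER_SHIFT;
          dst[r * dst_stride + c] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
        }
    }

The packs/packus pairs in the intrinsics perform the same [0, 255] clamp at the end of each pass, and the 8x4, 8x8 and 16x16 wrappers simply tile this 4x4 kernel, matching the TODO above about committing faster versions for the larger sizes later.
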
diff --git a/vp8/common/x86/filter_sse4.c b/vp8/common/x86/filter_sse4.c
index a037622..c461db1 100644
--- a/vp8/common/x86/filter_sse4.c
+++ b/vp8/common/x86/filter_sse4.c
@@ -25,9 +25,6 @@
 // TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
 //           of positive above 128), or have higher precision filter
 //           coefficients.
-// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not
-//           require SSE4.1
-// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3
 
 DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
   0x00, 0x01,
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index c4c8d4a..2326e46 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -201,5 +201,16 @@
                                    int pitch, int stride);
 #endif
 
+#if CONFIG_SUPERBLOCKS
+void vp8_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd);
+void vp8_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd);
+#endif
 
 #endif
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index adff88a..a6c8370 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -295,6 +295,7 @@
   const vp8_prob *prob, *coef_probs;
 
   switch (block_type) {
+    default:
     case TX_4X4:
       coef_probs = fc->coef_probs[type][0][0];
       break;
@@ -302,7 +303,7 @@
       coef_probs = fc->coef_probs_8x8[type][0][0];
       break;
 #if CONFIG_TX16X16
-    default:
+    case TX_16X16:
       coef_probs = fc->coef_probs_16x16[type][0][0];
       break;
 #endif
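
The reordering above matters because `default:` previously doubled as the TX_16X16 arm: with CONFIG_TX16X16 disabled the switch had no default at all, so an unexpected `block_type` left `coef_probs` uninitialized. After the change the statement reduces to roughly this shape with CONFIG_TX16X16 off (sketch with the preprocessor resolved; surrounding code elided):

    switch (block_type) {
      default:        /* unexpected values fall through to the 4x4 tables */
      case TX_4X4:
        coef_probs = fc->coef_probs[type][0][0];
        break;
      case TX_8X8:
        coef_probs = fc->coef_probs_8x8[type][0][0];
        break;
    }

With CONFIG_TX16X16 on, TX_16X16 gets its own explicit case instead of hiding behind `default:`.
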
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 4472497..f834e0b 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -21,6 +21,7 @@
 #include "vp8/common/setupintrarecon.h"
 #include "encodeintra.h"
 #include "vp8/common/reconinter.h"
+#include "vp8/common/invtrans.h"
 #include "rdopt.h"
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/reconintra.h"
@@ -76,7 +77,8 @@
                                     MACROBLOCK *x,
                                     TOKENEXTRA **t, int mb_col);
 static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
-
+extern void vp8_stuff_mb_8x8(VP8_COMP *cpi,
+                             MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
 
 #ifdef MODE_STATS
 unsigned int inter_y_modes[MB_MODE_COUNT];
@@ -852,7 +854,6 @@
                       MACROBLOCK  *x,
                       MACROBLOCKD *xd,
                       TOKENEXTRA **tp) {
-  VP8_COMMON *pc = cm;
   int i;
   int map_index;
   int mb_row, mb_col;
@@ -1693,7 +1694,6 @@
   // reset pointer, stuff EOBs where necessary
   *tp = t[0];
   for (n = 0; n < 4; n++) {
-    TOKENEXTRA *tbak = *tp;
     if (skip[n]) {
       x->e_mbd.above_context = &ta[n];
       x->e_mbd.left_context  = &tl[n];
@@ -1715,9 +1715,12 @@
   int n;
   MACROBLOCKD *xd = &x->e_mbd;
   VP8_COMMON *cm = &cpi->common;
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
   const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
@@ -2041,13 +2044,15 @@
   const int output_enabled = 1;
   VP8_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
   const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  int mis = xd->mode_info_stride;
   unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
   int seg_ref_active;
   unsigned char ref_pred_flag;
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 8c48b0d..a263505 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -67,8 +67,10 @@
   }
 }
 
-void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride,
-                           unsigned char *upred, unsigned char *vpred, int dst_stride) {
+void vp8_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride) {
   short *udiff = diff + 256;
   short *vdiff = diff + 320;
 
@@ -95,14 +97,16 @@
   }
 }
 
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc,
+                         unsigned char *vsrc, unsigned char *pred, int stride) {
   unsigned char *upred = pred + 256;
   unsigned char *vpred = pred + 320;
 
   vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
 }
 
-void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) {
+void vp8_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
+                          const unsigned char *pred, int dst_stride) {
   int r, c;
 
   for (r = 0; r < 16; r++) {
@@ -116,8 +120,8 @@
   }
 }
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
-{
+void vp8_subtract_mby_c(short *diff, unsigned char *src,
+                        unsigned char *pred, int stride) {
   vp8_subtract_mby_s_c(diff, src, stride, pred, 16);
 }
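
The `_s_` variants exist so callers can subtract with independent source and prediction strides (the superblock code predicts straight into the frame-sized reconstruction buffers), while vp8_subtract_mby_c / vp8_subtract_mbuv_c stay thin wrappers around them using the packed 16- and 8-wide prediction layout. A usage sketch against the encoder structs used elsewhere in this patch (x->src, xd->dst; the 384-entry layout matches the udiff/vdiff offsets above):

    short diff[384];  /* 256 luma + 64 U + 64 V residual coefficients */
    /* Subtract one macroblock, reading the source and the prediction from
       two differently strided planes. */
    vp8_subtract_mby_s_c(diff, x->src.y_buffer, x->src.y_stride,
                         xd->dst.y_buffer, xd->dst.y_stride);
    vp8_subtract_mbuv_s_c(diff, x->src.u_buffer, x->src.v_buffer,
                          x->src.uv_stride, xd->dst.u_buffer,
                          xd->dst.v_buffer, xd->dst.uv_stride);
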
 
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 13ddcf1..653774a 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -132,4 +132,14 @@
 
 void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
+#if CONFIG_SUPERBLOCKS
+void vp8_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride);
+void vp8_subtract_mby_s_c(short *diff, const unsigned char *src,
+                          int src_stride, const unsigned char *pred,
+                          int dst_stride);
+#endif
+
 #endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 256c703..c3df544 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -630,7 +630,6 @@
   for (row = 0; row < sb_rows; row++) {
     for (col = 0; col < sb_cols; col++) {
       MODE_INFO *miptr = mi + col * 2;
-      uint8_t *seg = segmap + col * 2;
       uint8_t *cache = segcache + col * 2;
 #if CONFIG_SUPERBLOCKS
       if (miptr->mbmi.encoded_as_sb) {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index d9b49bf..d217f2f 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2943,7 +2943,7 @@
   int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
   int uv_intra_skippable_8x8 = 0;
   int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv;
+  int distortion_uv = INT_MAX;
   int64_t best_yrd = INT64_MAX;
 #if CONFIG_PRED_FILTER
   int best_filter_state;
@@ -3856,7 +3856,6 @@
   int mode16x16;
   int mode8x8[2][4];
   int dist;
-  int rateuv8, rateuv_tokenonly8, distuv8;
 
   mbmi->ref_frame = INTRA_FRAME;
   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
@@ -3961,7 +3960,6 @@
   BLOCKD *d = &xd->block[0];
   MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame;
-  int mis = xd->mode_info_stride;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred;
   int_mv best_ref_mv, second_best_ref_mv;
@@ -4313,11 +4311,11 @@
               if ((sse - var < q2dc * q2dc >> 4) ||
                   (sse / 2 > var && sse - var < 64)) {
                 // Check u and v to make sure skip is ok
-                int sse2, sse3;
-                int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                unsigned int sse2, sse3;
+                var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
                                   (x->src.u_buffer, x->src.uv_stride,
                                    xd->dst.u_buffer, xd->dst.uv_stride, &sse2);
-                int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
                                   (x->src.v_buffer, x->src.uv_stride,
                                    xd->dst.v_buffer, xd->dst.uv_stride, &sse3);
                 sse2 += sse3;
@@ -4658,7 +4656,6 @@
                                       int recon_yoffset,
                                       int recon_uvoffset,
                                       int *totalrate, int *totaldist) {
-  VP8_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int rate, distortion;
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 9f708ac..4d3d034 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -116,6 +116,11 @@
 vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
 endif
 
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
+ifeq ($(HAVE_SSE2),yes)
+vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
+endif
+
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h