Merge "Use uniform quantizer for sub8x8 block coding" into nextgenv2

diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
index fa460bf..04c4321 100644
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc

@@ -21,13 +21,15 @@
 using std::tr1::tuple;
 using libvpx_test::ACMRandom;
 
-typedef void (*conv_horiz_t)(const uint8_t*, int, uint8_t*, int,
-                             int, int, const InterpFilterParams,
-                             const int, int, int);
+typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
+                              int, int, const InterpFilterParams,
+                              const int, int, int);
 // Test parameter list:
-//  <convolve_horiz_func, <width, height>, filter_params, subpel_x_q4, avg>
+//  <convolve_horiz_func, convolve_vert_func,
+//  <width, height>, filter_params, subpel_q4, avg>
 typedef tuple<int, int> BlockDimension;
-typedef tuple<conv_horiz_t, BlockDimension, INTERP_FILTER, int, int> ConvParams;
+typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
+              int, int> ConvParams;
 
 // Note:
 //  src_ and src_ref_ have special boundary requirement
@@ -44,13 +46,14 @@
  public:
   virtual ~VP10ConvolveOptimzTest() {}
   virtual void SetUp() {
-    conv_ = GET_PARAM(0);
-    BlockDimension block = GET_PARAM(1);
+    conv_horiz_ = GET_PARAM(0);
+    conv_vert_ = GET_PARAM(1);
+    BlockDimension block = GET_PARAM(2);
     width_ = std::tr1::get<0>(block);
     height_ = std::tr1::get<1>(block);
-    filter_ = GET_PARAM(2);
-    subpel_ = GET_PARAM(3);
-    avg_ = GET_PARAM(4);
+    filter_ = GET_PARAM(3);
+    subpel_ = GET_PARAM(4);
+    avg_ = GET_PARAM(5);
 
     alloc_ = new uint8_t[maxBlockSize * 4];
     src_ = alloc_ + (vertiOffset * maxWidth);
@@ -68,6 +71,7 @@
 
  protected:
   void RunHorizFilterBitExactCheck();
+  void RunVertFilterBitExactCheck();
 
  private:
   void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
@@ -75,7 +79,8 @@
                         int w, int h);
   void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
                         int w, int h, int fgroup, int findex);
-  conv_horiz_t conv_;
+  conv_filter_t conv_horiz_;
+  conv_filter_t conv_vert_;
   uint8_t *alloc_;
   uint8_t *src_;
   uint8_t *dst_;
@@ -94,10 +99,7 @@
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-  memset(src, 0, maxBlockSize);
-  memset(src_ref, 0, maxBlockSize);
-  memset(dst, 0, maxBlockSize);
-  memset(dst_ref, 0, maxBlockSize);
+  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
 
   uint8_t *src_ptr = src;
   uint8_t *dst_ptr = dst;
@@ -147,8 +149,8 @@
   vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
                         filter_params, subpel_, x_step_q4, avg_);
 
-  conv_(src_, stride, dst_, stride, width_, height_,
-        filter_params, subpel_, x_step_q4, avg_);
+  conv_horiz_(src_, stride, dst_, stride, width_, height_,
+              filter_params, subpel_, x_step_q4, avg_);
 
   DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
 
@@ -163,21 +165,40 @@
                         intermediate_height, filter_params, subpel_, x_step_q4,
                         avg_);
 
-  conv_(src_, stride, dst_, stride, width_,
-        intermediate_height, filter_params, subpel_, x_step_q4,
-        avg_);
+  conv_horiz_(src_, stride, dst_, stride, width_,
+              intermediate_height, filter_params, subpel_, x_step_q4,
+              avg_);
 
   DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
                    subpel_);
 }
 
+void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
+  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_convolve_vert_c(src_ref_, stride, dst_ref_, stride, width_, height_,
+                       filter_params, subpel_, x_step_q4, avg_);
+
+  conv_vert_(src_, stride, dst_, stride, width_, height_,
+             filter_params, subpel_, x_step_q4, avg_);
+
+  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
+}
+
 TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
   RunHorizFilterBitExactCheck();
 }
+TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {
+  RunVertFilterBitExactCheck();
+}
 
 using std::tr1::make_tuple;
 
 const BlockDimension kBlockDim[] = {
+  make_tuple(2, 2),
+  make_tuple(2, 4),
   make_tuple(4, 4),
   make_tuple(4, 8),
   make_tuple(8, 4),
@@ -198,7 +219,7 @@
 // 10/12-tap filters
 const INTERP_FILTER kFilter[] = {6, 4, 2};
 
-const int kSubpelXQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 
 const int kAvg[] = {0, 1};
 
@@ -207,9 +228,10 @@
     SSSE3, VP10ConvolveOptimzTest,
     ::testing::Combine(
          ::testing::Values(vp10_convolve_horiz_ssse3),
+         ::testing::Values(vp10_convolve_vert_ssse3),
          ::testing::ValuesIn(kBlockDim),
          ::testing::ValuesIn(kFilter),
-         ::testing::ValuesIn(kSubpelXQ4),
+         ::testing::ValuesIn(kSubpelQ4),
          ::testing::ValuesIn(kAvg)));
 #endif  // HAVE_SSSE3 && CONFIG_EXT_INTERP
 }  // namespace

diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
index 901f578..0d6bbcd 100644
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc

@@ -11,6 +11,13 @@
 using libvpx_test::ACMRandom;
 
 namespace {
+void setup_convolve() {
+#if HAVE_SSSE3
+  vp10_convolve_horiz = vp10_convolve_horiz_c;
+  vp10_convolve_vert = vp10_convolve_vert_c;
+#endif
+}
+
 TEST(VP10ConvolveTest, vp10_convolve8) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 #if CONFIG_DUAL_FILTER
@@ -41,7 +48,7 @@
   int w = 1;
   int h = 1;
 
-  vp10_rtcd();
+  setup_convolve();
 
   for (int i = 0; i < filter_size * filter_size; i++) {
     src[i] = rnd.Rand16() % (1 << 8);
@@ -89,7 +96,7 @@
   int subpel_x_q4;
   int subpel_y_q4;
 
-  vp10_rtcd();
+  setup_convolve();
 
   for (int i = 0; i < filter_size * filter_size; i++) {
     src[i] = rnd.Rand16() % (1 << 8);
@@ -155,7 +162,7 @@
   int subpel_x_q4;
   int subpel_y_q4;
 
-  vp10_rtcd();
+  setup_convolve();
 
   for (int i = 0; i < filter_size * filter_size; i++) {
     src0[i] = rnd.Rand16() % (1 << 8);

diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index e30f59a..6b090bf 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h

@@ -90,49 +90,8 @@
 #endif  // CONFIG_EXT_INTER
 } b_mode_info;
 
-// Note that the rate-distortion optimization loop, bit-stream writer, and
-// decoder implementation modules critically rely on the defined entry values
-// specified herein. They should be refactored concurrently.
-
-#define NONE           -1
-#define INTRA_FRAME     0
-#define LAST_FRAME      1
-
-#if CONFIG_EXT_REFS
-
-#define LAST2_FRAME     2
-#define LAST3_FRAME     3
-#define GOLDEN_FRAME    4
-#define BWDREF_FRAME    5
-#define ALTREF_FRAME    6
-#define MAX_REF_FRAMES  7
-#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
-
-#else
-
-#define GOLDEN_FRAME    2
-#define ALTREF_FRAME    3
-#define MAX_REF_FRAMES  4
-#endif  // CONFIG_EXT_REFS
-
 typedef int8_t MV_REFERENCE_FRAME;
 
-#define FWD_REF_FRAMES (GOLDEN_FRAME - LAST_FRAME + 1)
-#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
-#if CONFIG_EXT_REFS
-#define BWD_REF_FRAMES (ALTREF_FRAME - BWDREF_FRAME + 1)
-#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
-#else
-#define BWD_REF_FRAMES 1
-#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME)
-#endif
-
-#if CONFIG_REF_MV
-#define MODE_CTX_REF_FRAMES (MAX_REF_FRAMES + FWD_REF_FRAMES * BWD_REF_FRAMES)
-#else
-#define MODE_CTX_REF_FRAMES MAX_REF_FRAMES
-#endif
-
 typedef struct {
   // Number of base colors for Y (0) and UV (1)
   uint8_t palette_size[2];

diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index d75b0f7..d1ce121 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h

@@ -381,17 +381,46 @@
 typedef TX_SIZE TXFM_CONTEXT;
 #endif
 
+#define NONE           -1
+#define INTRA_FRAME     0
+#define LAST_FRAME      1
+
 #if CONFIG_EXT_REFS
-#define FWD_REFS 4
-#define BWD_REFS 2
-#define SINGLE_REFS (FWD_REFS + BWD_REFS)
-// NOTE(zoeliu): Following parameter is currently not being used
-// #define COMP_REFS (FWD_REFS * BWD_REFS)
+
+#define LAST2_FRAME     2
+#define LAST3_FRAME     3
+#define GOLDEN_FRAME    4
+#define BWDREF_FRAME    5
+#define ALTREF_FRAME    6
+#define MAX_REF_FRAMES  7
+#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
+
 #else
-#define SINGLE_REFS 3
-#define COMP_REFS 2
+
+#define GOLDEN_FRAME    2
+#define ALTREF_FRAME    3
+#define MAX_REF_FRAMES  4
 #endif  // CONFIG_EXT_REFS
 
+#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
+#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
+#if CONFIG_EXT_REFS
+#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
+#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
+#else
+#define BWD_REFS 1
+#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME)
+#endif
+
+#define SINGLE_REFS (FWD_REFS + BWD_REFS)
+#define COMP_REFS   (FWD_REFS * BWD_REFS)
+
+#if CONFIG_REF_MV
+#define MODE_CTX_REF_FRAMES (MAX_REF_FRAMES + COMP_REFS)
+#else
+#define MODE_CTX_REF_FRAMES MAX_REF_FRAMES
+#endif
+
 #if CONFIG_SUPERTX
 #define PARTITION_SUPERTX_CONTEXTS 2
 #define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32

diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h
index 96e8e89..70ef017 100644
--- a/vp10/common/mvref_common.h
+++ b/vp10/common/mvref_common.h

@@ -263,13 +263,13 @@
 static INLINE int8_t vp10_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
   if (rf[1] > INTRA_FRAME) {
     return MAX_REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
-        BWD_RF_OFFSET(rf[1]) * FWD_REF_FRAMES;
+        BWD_RF_OFFSET(rf[1]) * FWD_REFS;
   }
 
   return rf[0];
 }
 
-static MV_REFERENCE_FRAME ref_frame_map[FWD_REF_FRAMES * BWD_REF_FRAMES][2] = {
+static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = {
 #if CONFIG_EXT_REFS
   {LAST_FRAME, BWDREF_FRAME},
   {LAST2_FRAME, BWDREF_FRAME},

diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c
index b016531..1abd159 100644
--- a/vp10/common/vp10_convolve.c
+++ b/vp10/common/vp10_convolve.c

@@ -40,7 +40,7 @@
   }
 }
 
-static void convolve_vert(const uint8_t *src, int src_stride, uint8_t *dst,
+void vp10_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
                           int dst_stride, int w, int h,
                           const InterpFilterParams filter_params,
                           const int subpel_y_q4, int y_step_q4, int avg) {
@@ -133,13 +133,13 @@
         vp10_get_interp_filter_params(interp_filter);
 #endif
     assert(filter_params.taps <= MAX_FILTER_TAP);
-    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
-                  subpel_y_q4, y_step_q4, ref_idx);
+    vp10_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
+                       subpel_y_q4, y_step_q4, ref_idx);
   } else {
     // temp's size is set to (maximum possible intermediate_height) *
     // MAX_BLOCK_WIDTH
     uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
-                  MAX_FILTER_TAP) *
+                  MAX_FILTER_TAP + 1) *
                  MAX_BLOCK_WIDTH];
     int temp_stride = MAX_BLOCK_WIDTH;
 #if CONFIG_DUAL_FILTER
@@ -164,7 +164,7 @@
     assert(filter_params.taps <= MAX_FILTER_TAP);
 
     vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
-                        temp, temp_stride, w, intermediate_height,
+                        temp + temp_stride, temp_stride, w, intermediate_height,
                         filter_params, subpel_x_q4, x_step_q4, 0);
 
 #if CONFIG_DUAL_FILTER
@@ -175,9 +175,9 @@
     filter_size = filter_params.taps;
     assert(filter_params.taps <= MAX_FILTER_TAP);
 
-    convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, dst,
-                  dst_stride, w, h, filter_params,
-                  subpel_y_q4, y_step_q4, ref_idx);
+    vp10_convolve_vert(temp + temp_stride * (filter_size / 2), temp_stride,
+                       dst, dst_stride, w, h, filter_params,
+                       subpel_y_q4, y_step_q4, ref_idx);
   }
 }
 

diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 37b5891..5a41511 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl

@@ -90,6 +90,9 @@
 add_proto qw/void vp10_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
 specialize qw/vp10_convolve_horiz ssse3/;
 
+add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_y_q4, int y_step_q4, int avg";
+specialize qw/vp10_convolve_vert ssse3/;
+
 #
 # dct
 #

diff --git a/vp10/common/x86/vp10_convolve_ssse3.c b/vp10/common/x86/vp10_convolve_ssse3.c
index 91bd2d4..472990e 100644
--- a/vp10/common/x86/vp10_convolve_ssse3.c
+++ b/vp10/common/x86/vp10_convolve_ssse3.c

@@ -37,27 +37,29 @@
   // they're zero vectors.
 }
 
-typedef void (*store_pixel_t)(__m128i x, uint8_t *src, uint8_t *dst);
+typedef void (*store_pixel_t)(const __m128i *x, uint8_t *src, uint8_t *dst);
 
-static INLINE void store_4_pixel_only(__m128i x, uint8_t *src, uint8_t *dst) {
+static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *src,
+                                      uint8_t *dst) {
+  __m128i u;
   (void)src;
-  x = _mm_packus_epi16(x, x);
-  *(int *)dst = _mm_cvtsi128_si32(x);
+  u = _mm_packus_epi16(*x, *x);
+  *(int *)dst = _mm_cvtsi128_si32(u);
 }
 
-static INLINE __m128i accumulate_store(__m128i x, uint8_t *src) {
+static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   __m128i y = _mm_loadl_epi64((__m128i const *)src);
   y = _mm_unpacklo_epi8(y, zero);
-  y = _mm_add_epi16(x, y);
+  y = _mm_add_epi16(*x, y);
   y = _mm_add_epi16(y, one);
   y = _mm_srai_epi16(y, 1);
   y = _mm_packus_epi16(y, y);
   return y;
 }
 
-static INLINE void accumulate_store_4_pixel(__m128i x, uint8_t *src,
+static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *src,
                                             uint8_t *dst) {
   __m128i y = accumulate_store(x, src);
   *(int *)dst = _mm_cvtsi128_si32(y);
@@ -102,7 +104,7 @@
 
   sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
 
-  store_func(sumPairRow[1], dst, buf);
+  store_func(&sumPairRow[1], dst, buf);
 }
 
 void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
@@ -714,3 +716,190 @@
     }
   }
 }
+
+// Vertical convolution filtering
+static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *src,
+                                      uint8_t *dst) {
+  __m128i u;
+  uint32_t temp;
+  (void)src;
+  u = _mm_packus_epi16(*x, *x);
+  temp = _mm_cvtsi128_si32(u);
+  *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *src,
+                                            uint8_t *dst) {
+  uint32_t temp;
+  __m128i y = accumulate_store(x, src);
+  temp = _mm_cvtsi128_si32(y);
+  *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *src,
+                                      uint8_t *dst) {
+  __m128i u;
+  (void)src;
+  u = _mm_packus_epi16(*x, *x);
+  _mm_storel_epi64((__m128i *)dst, u);
+}
+
+static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *src,
+                                            uint8_t *dst) {
+  __m128i y = accumulate_store(x, src);
+  _mm_storel_epi64((__m128i *)dst, y);
+}
+
+static store_pixel_t store8pixelTab[2] = {
+  store_8_pixel_only, accumulate_store_8_pixel};
+
+static store_pixel_t store2pixelTab[2] = {
+  store_2_pixel_only, accumulate_store_2_pixel};
+
+static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
+                                 __m128i *f) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i min_x2x3, max_x2x3, sum;
+  // Only the low 8 bytes of each row are used below, so load just 8 bytes.
+  __m128i s0 = _mm_loadl_epi64((__m128i const *)(src));
+  __m128i s1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
+  __m128i s2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  __m128i s3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+  __m128i s4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
+  __m128i s5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
+  __m128i s6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
+  __m128i s7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
+  __m128i s8 = _mm_loadl_epi64((__m128i const *)(src + 8 * src_stride));
+  __m128i s9 = _mm_loadl_epi64((__m128i const *)(src + 9 * src_stride));
+  __m128i s10 = _mm_loadl_epi64((__m128i const *)(src + 10 * src_stride));
+  __m128i s11 = _mm_loadl_epi64((__m128i const *)(src + 11 * src_stride));
+  // Interleave vertically adjacent rows byte by byte (row 2k with row 2k+1).
+  s0 = _mm_unpacklo_epi8(s0, s1);
+  s2 = _mm_unpacklo_epi8(s2, s3);
+  s4 = _mm_unpacklo_epi8(s4, s5);
+  s6 = _mm_unpacklo_epi8(s6, s7);
+  s8 = _mm_unpacklo_epi8(s8, s9);
+  s10 = _mm_unpacklo_epi8(s10, s11);
+  // Multiply each interleaved row pair by f[k] and add adjacent products.
+  s0 = _mm_maddubs_epi16(s0, f[0]);
+  s2 = _mm_maddubs_epi16(s2, f[1]);
+  s4 = _mm_maddubs_epi16(s4, f[2]);
+  s6 = _mm_maddubs_epi16(s6, f[3]);
+  s8 = _mm_maddubs_epi16(s8, f[4]);
+  s10 = _mm_maddubs_epi16(s10, f[5]);
+  // Saturating accumulation; the s4/s6 terms go last, smaller one first.
+  min_x2x3 = _mm_min_epi16(s4, s6);
+  max_x2x3 = _mm_max_epi16(s4, s6);
+  sum = _mm_adds_epi16(s0, s2);
+  sum = _mm_adds_epi16(sum, s10);
+  sum = _mm_adds_epi16(sum, s8);
+
+  sum = _mm_adds_epi16(sum, min_x2x3);
+  sum = _mm_adds_epi16(sum, max_x2x3);
+  // mulhrs by 256 is a rounded >> 7; then clamp to 8 bits and re-widen.
+  sum = _mm_mulhrs_epi16(sum, k_256);
+  sum = _mm_packus_epi16(sum, sum);
+  sum = _mm_unpacklo_epi8(sum, zero);
+  return sum;
+}
+
+static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
+                                             __m128i *f, int tapsNum,
+                                             store_pixel_t store_func,
+                                             uint8_t *dst) {
+  __m128i sum;
+
+  if (10 == tapsNum) {
+    src -= src_stride;
+  }
+
+  sum = filter_vert_ssse3(src, src_stride, f);
+  store_func(&sum, dst, dst);
+}
+
+void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,
+                               int tapsNum, store_pixel_t store_func, int h,
+                               uint8_t *dst, int dst_stride) {
+  int rowIndex = 0;
+  do {
+    filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
+                                     dst);
+    rowIndex++;
+    src += src_stride;
+    dst += dst_stride;
+  } while (rowIndex < h);
+}
+
+void filter_vert_compute_large(const uint8_t *src, int src_stride, __m128i *f,
+                               int tapsNum, store_pixel_t store_func, int w,
+                               int h, uint8_t *dst, int dst_stride) {
+  int col;
+  int rowIndex = 0;
+  const uint8_t *src_ptr = src;
+  uint8_t *dst_ptr = dst;
+
+  do {
+    for (col = 0; col < w; col += 8) {
+      filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
+                                       store_func, dst_ptr);
+      src_ptr += 8;
+      dst_ptr += 8;
+    }
+    rowIndex++;
+    src_ptr = src + rowIndex * src_stride;
+    dst_ptr = dst + rowIndex * dst_stride;
+  } while (rowIndex < h);
+}
+
+void vp10_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride, int w, int h,
+                              const InterpFilterParams filter_params,
+                              const int subpel_y_q4, int y_step_q4, int avg) {
+  __m128i verf[6];
+  SubpelFilterCoeffs vCoeffs;
+  const uint8_t *src_ptr;
+  uint8_t *dst_ptr = dst;
+  store_pixel_t store2p = store2pixelTab[avg];
+  store_pixel_t store4p = store4pixelTab[avg];
+  store_pixel_t store8p = store8pixelTab[avg];
+  const int tapsNum = filter_params.taps;
+  // subpel 0 or a step other than 16 is handed to the C implementation.
+  if (0 == subpel_y_q4 || 16 != y_step_q4) {
+    vp10_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                         subpel_y_q4, y_step_q4, avg);
+    return;
+  }
+  // Fetch the coefficient set for this filter at index subpel_y_q4 - 1.
+  vCoeffs = vp10_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_y_q4 - 1);
+  // A zero/null result also falls back to the C implementation.
+  if (!vCoeffs) {
+    vp10_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                         subpel_y_q4, y_step_q4, avg);
+    return;
+  }
+  // Copy the six 128-bit coefficient vectors into the local array.
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+  // Move src up by (taps / 2 - 1) rows before filtering.
+  src -= src_stride * ((tapsNum >> 1) - 1);
+  src_ptr = src;
+  // w > 4 is processed in 8-pixel columns; w == 4 and w == 2 as one column.
+  if (w > 4) {
+    filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p,
+                              w, h, dst_ptr, dst_stride);
+  } else if (4 == w) {
+    filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p,
+                              h, dst_ptr, dst_stride);
+  } else if (2 == w) {
+    filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p,
+                              h, dst_ptr, dst_stride);
+  } else {
+    assert(0);
+  }
+}

diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 7557ce4..ba1ca68 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c

@@ -1369,9 +1369,14 @@
 #if CONFIG_EXT_INTER
       if (mbmi->ref_frame[1] != INTRA_FRAME)
 #endif  // CONFIG_EXT_INTER
-      if (is_motvar_allowed(mbmi))
+      if (is_motvar_allowed(mbmi)) {
+        // TODO(debargha): Might want to only emit this if SEG_LVL_SKIP
+        // is not active, and assume SIMPLE_TRANSLATION in the decoder if
+        // it is active.
+        assert(mbmi->motion_variation < MOTION_VARIATIONS);
         vp10_write_token(w, vp10_motvar_tree, cm->fc->motvar_prob[bsize],
                          &motvar_encodings[mbmi->motion_variation]);
+      }
 #endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
 
 #if CONFIG_EXT_INTER

diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 19ce4c3..caca295 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c

@@ -1810,14 +1810,20 @@
 #endif  // CONFIG_SUPERTX
       }
     } else {
-      vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+        // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set.
+        rd_cost->rate = INT_MAX;
+      } else {
+        vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+                                       rd_cost,
 #if CONFIG_SUPERTX
-                                     totalrate_nocoef,
+                                       totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
-                                     bsize, ctx, best_rd);
+                                       bsize, ctx, best_rd);
 #if CONFIG_SUPERTX
       assert(*totalrate_nocoef >= 0);
 #endif  // CONFIG_SUPERTX
+      }
     }
   }
 

diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index df96967..7f3646c 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c

@@ -1053,7 +1053,7 @@
   }
 #endif
 
-  if (x->optimize && p->eobs[block]) {
+  if (p->eobs[block]) {
     int ctx;
 #if CONFIG_VAR_TX
     switch (tx_size) {
@@ -1234,24 +1234,15 @@
     int idx, idy;
     int block = 0;
     int step = 1 << (max_tx_size * 2);
+    vp10_get_entropy_contexts(bsize, TX_4X4, pd, ctx.ta[plane], ctx.tl[plane]);
+#else
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+    vp10_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
 #endif
     vp10_subtract_plane(x, bsize, plane);
-
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
-
-    if (x->optimize) {
-#if CONFIG_VAR_TX
-      vp10_get_entropy_contexts(bsize, TX_4X4, pd,
-                                ctx.ta[plane], ctx.tl[plane]);
-#else
-      const struct macroblockd_plane* const pd = &xd->plane[plane];
-      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
-      vp10_get_entropy_contexts(bsize, tx_size, pd,
-                                ctx.ta[plane], ctx.tl[plane]);
-#endif
-    }
-
 #if CONFIG_VAR_TX
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bh) {
@@ -1351,7 +1342,7 @@
 #endif  // CONFIG_NEW_QUANT
   a = &args->ta[blk_col];
   l = &args->tl[blk_row];
-  if (x->optimize && p->eobs[block]) {
+  if (p->eobs[block]) {
     int ctx;
     ctx = combine_entropy_contexts(*a, *l);
     *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
@@ -1388,7 +1379,7 @@
 
   struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip, ta, tl};
 
-  if (enable_optimize_b && x->optimize) {
+  if (enable_optimize_b) {
     const struct macroblockd_plane* const pd = &xd->plane[plane];
     const TX_SIZE tx_size = plane ? get_uv_tx_size(&xd->mi[0]->mbmi, pd) :
         xd->mi[0]->mbmi.tx_size;

diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 97a7299..438cbb5 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c

@@ -3384,8 +3384,6 @@
   BufferPool *const pool = cm->buffer_pool;
   const int use_upsampled_ref = cpi->sf.use_upsampled_references;
   int new_uidx = 0;
-#if CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_REFS
 
   // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
   //       for the purpose to verify no mismatch between encoder and decoder.

diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 13ad7fe..78f1c46 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c

@@ -10214,6 +10214,7 @@
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
 #endif  // CONFIG_EXT_INTRA
   mbmi->mode = ZEROMV;
+  mbmi->motion_variation = SIMPLE_TRANSLATION;
   mbmi->uv_mode = DC_PRED;
   mbmi->ref_frame[0] = LAST_FRAME;
   mbmi->ref_frame[1] = NONE;