Merge "SSSE3 assembly for 4x4/8x8/16x16/32x32 H intra prediction."
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index cff27c8..fc5011b 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -381,7 +381,7 @@
                             RuntimeLibrary="$debug_runtime" \
                             UsePrecompiledHeader="0" \
                             WarningLevel="3" \
-                            DebugInformationFormat="1" \
+                            DebugInformationFormat="2" \
                             $warn_64bit \
 
                         $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
@@ -395,7 +395,7 @@
                             RuntimeLibrary="$debug_runtime" \
                             UsePrecompiledHeader="0" \
                             WarningLevel="3" \
-                            DebugInformationFormat="1" \
+                            DebugInformationFormat="2" \
                             $warn_64bit \
 
                         $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
diff --git a/configure b/configure
index 28676fb..36fc08f 100755
--- a/configure
+++ b/configure
@@ -682,6 +682,14 @@
             # iOS/ARM builds do not work with gtest. This does not match
             # x86 targets.
         ;;
+        *-win*)
+            # Some mingw toolchains don't have pthread available by default.
+            # Treat these more like visual studio where threading in gtest
+            # would be disabled for the same reason.
+            check_cxx "$@" <<EOF && soft_enable unit_tests
+int z;
+EOF
+        ;;
         *)
             enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
diff --git a/libs.mk b/libs.mk
index f7d8395..4aa7dc4 100644
--- a/libs.mk
+++ b/libs.mk
@@ -448,6 +448,10 @@
 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
 GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
 GTEST_OBJS=$(call objs,$(GTEST_SRCS))
+ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
+# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
+endif
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
 OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
@@ -472,7 +476,7 @@
         lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
     $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
         $(LIBVPX_TEST_OBJS) \
-        -L. -lvpx -lgtest -lpthread -lm)\
+        -L. -lvpx -lgtest $(extralibs) -lm)\
         )))\
     $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 9fb45d6..13b1dc8 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -13,6 +13,7 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_ports/mem.h"
 
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
@@ -264,6 +265,132 @@
   }
 }
 
+void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+               int stride, int /*tx_type*/) {
+  vp9_short_fdct16x16_c(in, out, stride);
+}
+void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                   int stride, int /*tx_type*/) {
+  vp9_short_idct16x16_add_c(out, dst, stride >> 1);
+}
+void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+              int stride, int tx_type) {
+  // FIXME(jingning): patch dependency on SSE2 16x16 hybrid transform coding
+#if HAVE_SSE2 && 0
+  vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type);
+#else
+  vp9_short_fht16x16_c(in, out, stride >> 1, tx_type);
+#endif
+}
+void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+  vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
+}
+
+class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
+ public:
+  FwdTrans16x16Test() { SetUpTestTxfm(); }
+  ~FwdTrans16x16Test() {}
+
+  void SetUpTestTxfm() {
+    tx_type_ = GetParam();
+    if (tx_type_ == 0) {
+      fwd_txfm = fdct16x16;
+      inv_txfm = idct16x16_add;
+    } else {
+      fwd_txfm = fht16x16;
+      inv_txfm = iht16x16_add;
+    }
+  }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*fwd_txfm)(in, out, dst, stride, tx_type);
+  }
+  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+                  int stride, int tx_type) {
+    (*inv_txfm)(in, out, dst, stride, tx_type);
+  }
+
+  int tx_type_;
+  void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+  void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+};
+
+TEST_P(FwdTrans16x16Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int max_error = 0;
+  double total_error = 0;
+  const int count_test_block = 10000;
+  for (int i = 0; i < count_test_block; ++i) {
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
+
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j)
+      test_input_block[j] = src[j] - dst[j];
+
+    const int pitch = 32;
+    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+
+    for (int j = 0; j < 256; ++j) {
+      const int diff = dst[j] - src[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1, max_error)
+      << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
+
+  EXPECT_GE(count_test_block , total_error)
+      << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
+}
+
+TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 256; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 256; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 32;
+    RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_);
+    RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_);
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < 256; ++j) {
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
 
 TEST(VP9Idct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -295,72 +422,4 @@
   }
 }
 
-// we need enable fdct test once we re-do the 16 point fdct.
-TEST(VP9Fdct16x16Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int max_error = 0;
-  double total_error = 0;
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[256];
-    int16_t test_temp_block[256];
-    uint8_t dst[256], src[256];
-
-    for (int j = 0; j < 256; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j)
-      test_input_block[j] = src[j] - dst[j];
-
-    const int pitch = 32;
-    vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
-
-    for (int j = 0; j < 256; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  EXPECT_GE(1, max_error)
-      << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";
-
-  EXPECT_GE(count_test_block , total_error)
-      << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";
-}
-
-TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input_block[256], input_extreme_block[256];
-    int16_t output_block[256], output_extreme_block[256];
-
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 256; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 256; ++j)
-        input_extreme_block[j] = 255;
-
-    const int pitch = 32;
-    vp9_short_fdct16x16_c(input_block, output_block, pitch);
-    vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch);
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < 256; ++j) {
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";
-    }
-  }
-}
 }  // namespace
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7c7ae6d..16244e0 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -391,10 +391,9 @@
 
 static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
                                    PARTITION_TYPE partition) {
-  BLOCK_SIZE_TYPE subsize;
+  BLOCK_SIZE_TYPE subsize = bsize;
   switch (partition) {
     case PARTITION_NONE:
-      subsize = bsize;
       break;
     case PARTITION_HORZ:
       if (bsize == BLOCK_SIZE_SB64X64)
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 46ae503..914afa7 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -217,12 +217,13 @@
    * h == 64, taps == 8.
    */
   uint8_t temp[64 * 135];
-  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
 
   assert(w <= 64);
   assert(h <= 64);
   assert(taps <= 8);
   assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
 
   if (intermediate_height < h)
     intermediate_height = h;
@@ -246,12 +247,13 @@
    * h == 64, taps == 8.
    */
   uint8_t temp[64 * 135];
-  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+  int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
 
   assert(w <= 64);
   assert(h <= 64);
   assert(taps <= 8);
   assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
 
   if (intermediate_height < h)
     intermediate_height = h;
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index a62fa47..50e8463 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -85,7 +85,7 @@
 #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
 
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
-  MV_CLASS_TYPE c;
+  MV_CLASS_TYPE c = MV_CLASS_0;
   if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
   else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
   else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
diff --git a/vp9/common/vp9_maskingmv.c b/vp9/common/vp9_maskingmv.c
deleted file mode 100644
index 326201b..0000000
--- a/vp9/common/vp9_maskingmv.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- ============================================================================
- Name        : vp9_maskingmv.c
- Author      : jimbankoski
- Version     :
- Copyright   : Your copyright notice
- Description : Hello World in C, Ansi-style
- ============================================================================
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-unsigned int vp9_sad16x16_sse3(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  int  max_err);
-
-int vp8_growmaskmb_sse3(
-  unsigned char *om,
-  unsigned char *nm);
-
-void vp8_makemask_sse3(
-  unsigned char *y,
-  unsigned char *u,
-  unsigned char *v,
-  unsigned char *ym,
-  int yp,
-  int uvp,
-  int ys,
-  int us,
-  int vs,
-  int yt,
-  int ut,
-  int vt);
-
-unsigned int vp9_sad16x16_unmasked_wmt(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  unsigned char *mask);
-
-unsigned int vp9_sad16x16_masked_wmt(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  unsigned char *mask);
-
-unsigned int vp8_masked_predictor_wmt(
-  unsigned char *masked,
-  unsigned char *unmasked,
-  int  src_stride,
-  unsigned char *dst_ptr,
-  int  dst_stride,
-  unsigned char *mask);
-unsigned int vp8_masked_predictor_uv_wmt(
-  unsigned char *masked,
-  unsigned char *unmasked,
-  int  src_stride,
-  unsigned char *dst_ptr,
-  int  dst_stride,
-  unsigned char *mask);
-unsigned int vp8_uv_from_y_mask(
-  unsigned char *ymask,
-  unsigned char *uvmask);
-int yp = 16;
-unsigned char sxy[] = {
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
-};
-
-unsigned char sts[] = {
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-};
-unsigned char str[] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-
-unsigned char y[] = {
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
-};
-int uvp = 8;
-unsigned char u[] = {
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  84, 70, 70, 90, 90, 90, 17, 17,
-  84, 70, 70, 90, 90, 90, 17, 17,
-  80, 70, 70, 90, 90, 90, 17, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17
-};
-
-unsigned char v[] = {
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80
-};
-
-unsigned char ym[256];
-unsigned char uvm[64];
-typedef struct {
-  unsigned char y;
-  unsigned char yt;
-  unsigned char u;
-  unsigned char ut;
-  unsigned char v;
-  unsigned char vt;
-  unsigned char use;
-} COLOR_SEG_ELEMENT;
-
-/*
-COLOR_SEG_ELEMENT segmentation[]=
-{
-    { 60,4,80,17,80,10, 1},
-    { 40,4,15,10,80,10, 1},
-};
-*/
-
-COLOR_SEG_ELEMENT segmentation[] = {
-  { 79, 44, 92, 44, 237, 60, 1},
-};
-
-unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
-                         COLOR_SEG_ELEMENT sgm[],
-                         int c) {
-  COLOR_SEG_ELEMENT *s = sgm;
-  unsigned char m = 0;
-  int i;
-  for (i = 0; i < c; i++, s++)
-    m |= (abs(y - s->y) < s->yt &&
-          abs(u - s->u) < s->ut &&
-          abs(v - s->v) < s->vt ? 255 : 0);
-
-  return m;
-}
-int neighbors[256][8];
-int makeneighbors(void) {
-  int i, j;
-  for (i = 0; i < 256; i++) {
-    int r = (i >> 4), c = (i & 15);
-    int ni = 0;
-    for (j = 0; j < 8; j++)
-      neighbors[i][j] = i;
-    for (j = 0; j < 256; j++) {
-      int nr = (j >> 4), nc = (j & 15);
-      if (abs(nr - r) < 2 && abs(nc - c) < 2)
-        neighbors[i][ni++] = j;
-    }
-  }
-  return 0;
-}
-void grow_ymask(unsigned char *ym) {
-  unsigned char nym[256];
-  int i, j;
-
-  for (i = 0; i < 256; i++) {
-    nym[i] = ym[i];
-    for (j = 0; j < 8; j++) {
-      nym[i] |= ym[neighbors[i][j]];
-    }
-  }
-  for (i = 0; i < 256; i++)
-    ym[i] = nym[i];
-}
-
-void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
-                  unsigned char *ym, unsigned char *uvm,
-                  int yp, int uvp,
-                  COLOR_SEG_ELEMENT sgm[],
-                  int count) {
-  int r, c;
-  unsigned char *oym = ym;
-
-  memset(ym, 20, 256);
-  for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
-    for (c = 0; c < 8; c++) {
-      int y1 = y[c << 1];
-      int u1 = u[c];
-      int v1 = v[c];
-      int m = pixel_mask(y1, u1, v1, sgm, count);
-      uvm[c] = m;
-      ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
-      ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
-      ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
-      ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
-    }
-  grow_ymask(oym);
-}
-
-int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
-               unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
-    for (j = 0; j < 16; j++)
-      if (ym[j])
-        sad += abs(src[j] - dst[j]);
-
-  return sad;
-}
-
-int compare_masks(unsigned char *sym, unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, sym += 16, ym += 16)
-    for (j = 0; j < 16; j++)
-      sad += (sym[j] != ym[j] ? 1 : 0);
-
-  return sad;
-}
-
-int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
-                 unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
-    for (j = 0; j < 16; j++)
-      if (!ym[j])
-        sad += abs(src[j] - dst[j]);
-
-  return sad;
-}
-
-int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
-                         int yp, int uvp,
-                         unsigned char *dy, unsigned char *du, unsigned char *dv,
-                         int dyp, int duvp,
-                         COLOR_SEG_ELEMENT sgm[],
-                         int count,
-                         int *mi,
-                         int *mj,
-                         int *ui,
-                         int *uj,
-                         int *wm) {
-  int i, j;
-
-  unsigned char ym[256];
-  unsigned char uvm[64];
-  unsigned char dym[256];
-  unsigned char duvm[64];
-  unsigned int e = 0;
-  int beste = 256;
-  int bmi = -32, bmj = -32;
-  int bui = -32, buj = -32;
-  int beste1 = 256;
-  int bmi1 = -32, bmj1 = -32;
-  int bui1 = -32, buj1 = -32;
-  int obeste;
-
-  // first try finding best mask and then unmasked
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
-      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  // bui=0;buj=0;
-  // best mv masked destination
-  make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
-               dym, duvm, dyp, duvp, sgm, count);
-
-  obeste = beste;
-  beste = 0xffffffff;
-
-  // find best masked
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = masked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-  beste1 = beste + obeste;
-  bmi1 = bmi;
-  bmj1 = bmj;
-  bui1 = bui;
-  buj1 = buj;
-
-  beste = 0xffffffff;
-  // source mask
-  make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
-
-  // find best mask
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
-      e = compare_masks(ym, dym);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-
-
-  // best mv masked destination
-  make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
-               dym, duvm, dyp, duvp, sgm, count);
-
-  obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
-
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  beste += obeste;
-
-
-  if (beste < beste1) {
-    *mi = bmi;
-    *mj = bmj;
-    *ui = bui;
-    *uj = buj;
-    *wm = 1;
-  } else {
-    *mi = bmi1;
-    *mj = bmj1;
-    *ui = bui1;
-    *uj = buj1;
-    *wm = 0;
-
-  }
-  return 0;
-}
-
-int predict(unsigned char *src, int p, unsigned char *dst, int dp,
-            unsigned char *ym, unsigned char *prd) {
-  int i, j;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
-    for (j = 0; j < 16; j++)
-      prd[j] = (ym[j] ? src[j] : dst[j]);
-  return 0;
-}
-
-int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
-                              int yp, int uvp,
-                              unsigned char *dy, unsigned char *du, unsigned char *dv,
-                              int dyp, int duvp,
-                              COLOR_SEG_ELEMENT sgm[],
-                              int count,
-                              int *mi,
-                              int *mj,
-                              int *ui,
-                              int *uj,
-                              int *wm) {
-  int i, j;
-
-  unsigned char ym[256];
-  unsigned char ym2[256];
-  unsigned char uvm[64];
-  unsigned char dym2[256];
-  unsigned char dym[256];
-  unsigned char duvm[64];
-  unsigned int e = 0;
-  int beste = 256;
-  int bmi = -32, bmj = -32;
-  int bui = -32, buj = -32;
-  int beste1 = 256;
-  int bmi1 = -32, bmj1 = -32;
-  int bui1 = -32, buj1 = -32;
-  int obeste;
-
-  // first try finding best mask and then unmasked
-  beste = 0xffffffff;
-
-#if 0
-  for (i = 0; i < 16; i++) {
-    unsigned char *dy = i * yp + y;
-    for (j = 0; j < 16; j++)
-      printf("%2x", dy[j]);
-    printf("\n");
-  }
-  printf("\n");
-
-  for (i = -32; i < 48; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 48; j++)
-      printf("%2x", dyz[j]);
-    printf("\n");
-  }
-#endif
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
-                        sgm[0].y, sgm[0].u, sgm[0].v,
-                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-      vp8_growmaskmb_sse3(dym, dym2);
-
-      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  // bui=0;buj=0;
-  // best mv masked destination
-
-  vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
-                    dym, dyp, duvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-
-  obeste = beste;
-  beste = 0xffffffff;
-
-  // find best masked
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-  beste1 = beste + obeste;
-  bmi1 = bmi;
-  bmj1 = bmj;
-  bui1 = bui;
-  buj1 = buj;
-
-  // source mask
-  vp8_makemask_sse3(y, u, v,
-                    ym, yp, uvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(ym, ym2);
-
-  // find best mask
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
-                        sgm[0].y, sgm[0].u, sgm[0].v,
-                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-      vp8_growmaskmb_sse3(dym, dym2);
-
-      e = compare_masks(ym2, dym2);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-
-  vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
-                    dym, dyp, duvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-
-  obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
-
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  beste += obeste;
-
-  if (beste < beste1) {
-    *mi = bmi;
-    *mj = bmj;
-    *ui = bui;
-    *uj = buj;
-    *wm = 1;
-  } else {
-    *mi = bmi1;
-    *mj = bmj1;
-    *ui = bui1;
-    *uj = buj1;
-    *wm = 0;
-    beste = beste1;
-
-  }
-  return beste;
-}
-
-int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
-                int ymp, int uvmp,
-                unsigned char *yp, unsigned char *up, unsigned char *vp,
-                int ypp, int uvpp,
-                COLOR_SEG_ELEMENT sgm[],
-                int count,
-                int mi,
-                int mj,
-                int ui,
-                int uj,
-                int wm) {
-  int i, j;
-  unsigned char dym[256];
-  unsigned char dym2[256];
-  unsigned char duvm[64];
-  unsigned char *yu = ym, *uu = um, *vu = vm;
-
-  unsigned char *dym3 = dym2;
-
-  ym += mi * ymp + mj;
-  um += mi / 2 * uvmp + mj / 2;
-  vm += mi / 2 * uvmp + mj / 2;
-
-  yu += ui * ymp + uj;
-  uu += ui / 2 * uvmp + uj / 2;
-  vu += ui / 2 * uvmp + uj / 2;
-
-  // best mv masked destination
-  if (wm)
-    vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
-                      sgm[0].y, sgm[0].u, sgm[0].v,
-                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
-  else
-    vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
-                      sgm[0].y, sgm[0].u, sgm[0].v,
-                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-  vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
-  vp8_uv_from_y_mask(dym3, duvm);
-  vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
-  vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
-
-  return 0;
-}
-
-unsigned char f0p[1280 * 720 * 3 / 2];
-unsigned char f1p[1280 * 720 * 3 / 2];
-unsigned char prd[1280 * 720 * 3 / 2];
-unsigned char msk[1280 * 720 * 3 / 2];
-
-
-int mainz(int argc, char *argv[]) {
-
-  FILE *f = fopen(argv[1], "rb");
-  FILE *g = fopen(argv[2], "wb");
-  int w = atoi(argv[3]), h = atoi(argv[4]);
-  int y_stride = w, uv_stride = w / 2;
-  int r, c;
-  unsigned char *f0 = f0p, *f1 = f1p, *t;
-  unsigned char ym[256], uvm[64];
-  unsigned char ym2[256], uvm2[64];
-  unsigned char ym3[256], uvm3[64];
-  int a, b;
-
-  COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
-#if 0
-  makeneighbors();
-  COLOR_SEG_ELEMENT segmentation[] = {
-    { 60, 4, 80, 17, 80, 10, 1},
-    { 40, 4, 15, 10, 80, 10, 1},
-  };
-  make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
-
-  vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
-                    (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
-                    segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
-
-  vp8_growmaskmb_sse3(ym, ym3);
-
-  a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
-  b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
-
-  vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
-
-  vp8_uv_from_y_mask(ym3, uvm3);
-
-  return 4;
-#endif
-  makeneighbors();
-
-
-  memset(prd, 128, w * h * 3 / 2);
-
-  fread(f0, w * h * 3 / 2, 1, f);
-
-  while (!feof(f)) {
-    unsigned char *ys = f1, *yd = f0, *yp = prd;
-    unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
-    unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
-    fread(f1, w * h * 3 / 2, 1, f);
-
-    ys += 32 * y_stride;
-    yd += 32 * y_stride;
-    yp += 32 * y_stride;
-    us += 16 * uv_stride;
-    ud += 16 * uv_stride;
-    up += 16 * uv_stride;
-    vs += 16 * uv_stride;
-    vd += 16 * uv_stride;
-    vp += 16 * uv_stride;
-    for (r = 32; r < h - 32; r += 16,
-         ys += 16 * w, yd += 16 * w, yp += 16 * w,
-         us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
-         vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
-      for (c = 32; c < w - 32; c += 16) {
-        int mi, mj, ui, uj, wm;
-        int bmi, bmj, bui, buj, bwm;
-        unsigned char ym[256];
-
-        if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
-          bmi = bmj = bui = buj = bwm = 0;
-        else {
-          COLOR_SEG_ELEMENT cs[5];
-          int j;
-          unsigned int beste = 0xfffffff;
-          unsigned int bestj = 0;
-
-          // try color from last mb segmentation
-          cs[0] = last;
-
-          // try color segs from 4 pixels in mb recon as segmentation
-          cs[1].y = yd[c + y_stride + 1];
-          cs[1].u = ud[c / 2 + uv_stride];
-          cs[1].v = vd[c / 2 + uv_stride];
-          cs[1].yt = cs[1].ut = cs[1].vt = 20;
-          cs[2].y = yd[c + w + 14];
-          cs[2].u = ud[c / 2 + uv_stride + 7];
-          cs[2].v = vd[c / 2 + uv_stride + 7];
-          cs[2].yt = cs[2].ut = cs[2].vt = 20;
-          cs[3].y = yd[c + w * 14 + 1];
-          cs[3].u = ud[c / 2 + uv_stride * 7];
-          cs[3].v = vd[c / 2 + uv_stride * 7];
-          cs[3].yt = cs[3].ut = cs[3].vt = 20;
-          cs[4].y = yd[c + w * 14 + 14];
-          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
-          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
-          cs[4].yt = cs[4].ut = cs[4].vt = 20;
-
-          for (j = 0; j < 5; j++) {
-            int e;
-
-            e = fast_masked_motion_search(
-                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
-                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
-                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
-
-            if (e < beste) {
-              bmi = mi;
-              bmj = mj;
-              bui = ui;
-              buj = uj, bwm = wm;
-              bestj = j;
-              beste = e;
-            }
-          }
-          best = cs[bestj];
-          // best = segmentation[0];
-          last = best;
-        }
-        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
-                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
-                    &best, 1, bmi, bmj, bui, buj, bwm);
-
-      }
-    }
-    fwrite(prd, w * h * 3 / 2, 1, g);
-    t = f0;
-    f0 = f1;
-    f1 = t;
-
-  }
-  fclose(f);
-  fclose(g);
-  return 0;
-}
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index f331a8f..265a19a 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -365,17 +365,16 @@
   MACROBLOCKD * const xd = arg->xd;
   const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
   const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-  const int bh = 4 << bhl, bw = 4 << bwl;
   const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
   const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
   int which_mv;
 
-  assert(x < bw);
-  assert(y < bh);
+  assert(x < (4 << bwl));
+  assert(y < (4 << bhl));
   assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
-         4 << pred_w == bw);
+         4 << pred_w == (4 << bwl));
   assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
-         4 << pred_h == bh);
+         4 << pred_h == (4 << bhl));
 
   for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
     // source
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 0ecfa6e..d861a7a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -744,7 +744,7 @@
 specialize vp9_short_fht8x8 sse2
 
 prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht16x16
+specialize vp9_short_fht16x16 sse2
 
 prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct8x8 sse2
diff --git a/vp9/common/x86/vp9_mask_sse3.asm b/vp9/common/x86/vp9_mask_sse3.asm
deleted file mode 100644
index fe46823..0000000
--- a/vp9/common/x86/vp9_mask_sse3.asm
+++ /dev/null
@@ -1,484 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void int vp8_makemask_sse3(
-;    unsigned char *y,
-;    unsigned char *u,
-;    unsigned char *v,
-;    unsigned char *ym,
-;    unsigned char *uvm,
-;    int yp,
-;    int uvp,
-;    int ys,
-;    int us,
-;    int vs,
-;    int yt,
-;    int ut,
-;    int vt)
-global sym(vp8_makemask_sse3) PRIVATE
-sym(vp8_makemask_sse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 14
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;y
-        mov             rdi,        arg(1) ;u
-        mov             rcx,        arg(2) ;v
-        mov             rax,        arg(3) ;ym
-        movsxd          rbx,        dword arg(4) ;yp
-        movsxd          rdx,        dword arg(5) ;uvp
-
-        pxor            xmm0,xmm0
-
-        ;make 16 copies of the center y value
-        movd            xmm1, arg(6)
-        pshufb          xmm1, xmm0
-
-        ; make 16 copies of the center u value
-        movd            xmm2, arg(7)
-        pshufb          xmm2, xmm0
-
-        ; make 16 copies of the center v value
-        movd            xmm3, arg(8)
-        pshufb          xmm3, xmm0
-        unpcklpd        xmm2, xmm3
-
-        ;make 16 copies of the y tolerance
-        movd            xmm3, arg(9)
-        pshufb          xmm3, xmm0
-
-        ;make 16 copies of the u tolerance
-        movd            xmm4, arg(10)
-        pshufb          xmm4, xmm0
-
-        ;make 16 copies of the v tolerance
-        movd            xmm5, arg(11)
-        pshufb          xmm5, xmm0
-        unpckhpd        xmm4, xmm5
-
-        mov             r8,8
-
-NextPairOfRows:
-
-        ;grab the y source values
-        movdqu          xmm0, [rsi]
-
-        ;compute abs difference between source and y target
-        movdqa          xmm6, xmm1
-        movdqa          xmm7, xmm0
-        psubusb         xmm0, xmm1
-        psubusb         xmm6, xmm7
-        por             xmm0, xmm6
-
-        ;compute abs difference between
-        movdqa          xmm6, xmm3
-        pcmpgtb         xmm6, xmm0
-
-        ;grab the y source values
-        add             rsi, rbx
-        movdqu          xmm0, [rsi]
-
-        ;compute abs difference between source and y target
-        movdqa          xmm11, xmm1
-        movdqa          xmm7, xmm0
-        psubusb         xmm0, xmm1
-        psubusb         xmm11, xmm7
-        por             xmm0, xmm11
-
-        ;compute abs difference between
-        movdqa          xmm11, xmm3
-        pcmpgtb         xmm11, xmm0
-
-
-        ;grab the u and v source values
-        movdqu          xmm7, [rdi]
-        movdqu          xmm8, [rcx]
-        unpcklpd        xmm7, xmm8
-
-        ;compute abs difference between source and uv targets
-        movdqa          xmm9, xmm2
-        movdqa          xmm10, xmm7
-        psubusb         xmm7, xmm2
-        psubusb         xmm9, xmm10
-        por             xmm7, xmm9
-
-        ;check whether the number is < tolerance
-        movdqa          xmm0, xmm4
-        pcmpgtb         xmm0, xmm7
-
-        ;double  u and v masks
-        movdqa          xmm8, xmm0
-        punpckhbw       xmm0, xmm0
-        punpcklbw       xmm8, xmm8
-
-        ;mask row 0 and output
-        pand            xmm6, xmm8
-        pand            xmm6, xmm0
-        movdqa          [rax],xmm6
-
-        ;mask row 1 and output
-        pand            xmm11, xmm8
-        pand            xmm11, xmm0
-        movdqa          [rax+16],xmm11
-
-
-        ; to the next row or set of rows
-        add             rsi, rbx
-        add             rdi, rdx
-        add             rcx, rdx
-        add             rax,32
-        dec r8
-        jnz NextPairOfRows
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;GROW_HORIZ (register for result, source register or mem local)
-; takes source and shifts left and ors with source
-; then shifts right and ors with source
-%macro GROW_HORIZ 2
-    movdqa          %1, %2
-    movdqa          xmm14, %1
-    movdqa          xmm15, %1
-    pslldq          xmm14, 1
-    psrldq          xmm15, 1
-    por             %1,xmm14
-    por             %1,xmm15
-%endmacro
-;GROW_VERT (result, center row, above row, below row)
-%macro GROW_VERT 4
-    movdqa          %1,%2
-    por             %1,%3
-    por             %1,%4
-%endmacro
-
-;GROW_NEXTLINE (new line to grow, new source, line to write)
-%macro GROW_NEXTLINE 3
-    GROW_HORIZ %1, %2
-    GROW_VERT xmm3, xmm0, xmm1, xmm2
-    movdqa %3,xmm3
-%endmacro
-
-
-;void int vp8_growmaskmb_sse3(
-;    unsigned char *om,
-;    unsigned char *nm,
-global sym(vp8_growmaskmb_sse3) PRIVATE
-sym(vp8_growmaskmb_sse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;src
-    mov             rdi,        arg(1) ;rst
-
-    GROW_HORIZ xmm0, [rsi]
-    GROW_HORIZ xmm1, [rsi+16]
-    GROW_HORIZ xmm2, [rsi+32]
-
-    GROW_VERT xmm3, xmm0, xmm1, xmm2
-    por xmm0,xmm1
-    movdqa [rdi], xmm0
-    movdqa [rdi+16],xmm3
-
-    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
-    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
-    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
-    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
-    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
-    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
-    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
-    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
-    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
-    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
-    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
-    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
-    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
-
-    por xmm0,xmm2
-    movdqa [rdi+240], xmm0
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int vp8_sad16x16_masked_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt) PRIVATE
-sym(vp8_sad16x16_masked_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(2) ;ref_ptr
-
-    mov             rbx,        arg(4) ;mask
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
-NextSadRow:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-    pand            xmm0,       xmm2
-    pand            xmm1,       xmm2
-
-    psadbw          xmm0,       xmm1
-    paddw           xmm3,       xmm0
-
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz NextSadRow
-
-    movdqa          xmm4 ,     xmm3
-    psrldq          xmm4,       8
-    paddw           xmm3,      xmm4
-    movq            rax,       xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad16x16_unmasked_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt) PRIVATE
-sym(vp8_sad16x16_unmasked_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(2) ;ref_ptr
-
-    mov             rbx,        arg(4) ;mask
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
-next_vp8_sad16x16_unmasked_wmt:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-    por             xmm0,       xmm2
-    por             xmm1,       xmm2
-
-    psadbw          xmm0,       xmm1
-    paddw           xmm3,       xmm0
-
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz next_vp8_sad16x16_unmasked_wmt
-
-    movdqa          xmm4 ,     xmm3
-    psrldq          xmm4,       8
-    paddw           xmm3,      xmm4
-    movq            rax,        xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_masked_predictor_wmt(
-;    unsigned char *masked,
-;    unsigned char *unmasked,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    unsigned char *mask)
-global sym(vp8_masked_predictor_wmt) PRIVATE
-sym(vp8_masked_predictor_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(1) ;ref_ptr
-
-    mov             rbx,        arg(5) ;mask
-    movsxd          rax,        dword ptr arg(2) ;src_stride
-    mov             r11,        arg(3) ; destination
-    movsxd          rdx,        dword ptr arg(4) ;dst_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
-next_vp8_masked_predictor_wmt:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-
-    pand            xmm0,       xmm2
-    pandn           xmm2,       xmm1
-    por             xmm0,       xmm2
-    movdqu          [r11],      xmm0
-
-    add             r11, rdx
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz next_vp8_masked_predictor_wmt
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp8_masked_predictor_uv_wmt(
-;    unsigned char *masked,
-;    unsigned char *unmasked,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt) PRIVATE
-sym(vp8_masked_predictor_uv_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(1) ;ref_ptr
-
-    mov             rbx,        arg(5) ;mask
-    movsxd          rax,        dword ptr arg(2) ;src_stride
-    mov             r11,        arg(3) ; destination
-    movsxd          rdx,        dword ptr arg(4) ;dst_stride
-
-    mov             rcx,        8
-
-    pxor            xmm3,       xmm3
-
-next_vp8_masked_predictor_uv_wmt:
-    movq            xmm0,       [rsi]
-    movq            xmm1,       [rdi]
-    movq            xmm2,       [rbx]
-
-    pand            xmm0,       xmm2
-    pandn           xmm2,       xmm1
-    por             xmm0,       xmm2
-    movq            [r11],      xmm0
-
-    add             r11, rdx
-    add             rsi, rax
-    add             rdi, rax
-    add             rbx,  8
-
-    dec rcx
-    jnz next_vp8_masked_predictor_uv_wmt
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_uv_from_y_mask(
-;    unsigned char *ymask,
-;    unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask) PRIVATE
-sym(vp8_uv_from_y_mask):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(1) ;dst_ptr
-
-
-    mov             rcx,        8
-
-    pxor            xmm3,       xmm3
-
-next_p8_uv_from_y_mask:
-    movdqu          xmm0,       [rsi]
-    pshufb          xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
-    movq            [rdi],xmm0
-    add             rdi, 8
-    add             rsi,32
-
-    dec rcx
-    jnz next_p8_uv_from_y_mask
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 6885c77..3a2990c 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -629,7 +629,7 @@
   int bsl = b_width_log2(bsize);
   int bs = (1 << bsl) / 4;  // mode_info step for subsize
   int n;
-  PARTITION_TYPE partition;
+  PARTITION_TYPE partition = PARTITION_NONE;
   BLOCK_SIZE_TYPE subsize;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
@@ -1196,6 +1196,55 @@
     }
 }
 
+static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
+  VP9_COMMON *const cm = &cpi->common;
+  vp9_writer residual_bc;
+
+  int tile_row, tile_col;
+  TOKENEXTRA *tok[4][1 << 6], *tok_end;
+  size_t total_size = 0;
+
+  vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+             mi_cols_aligned_to_sb(cm->mi_cols));
+
+  tok[0][0] = cpi->tok;
+  for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+    if (tile_row)
+      tok[tile_row][0] = tok[tile_row - 1][cm->tile_columns - 1] +
+                         cpi->tok_count[tile_row - 1][cm->tile_columns - 1];
+
+    for (tile_col = 1; tile_col < cm->tile_columns; tile_col++)
+      tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
+                                cpi->tok_count[tile_row][tile_col - 1];
+  }
+
+  for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+    vp9_get_tile_row_offsets(cm, tile_row);
+    for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+      vp9_get_tile_col_offsets(cm, tile_col);
+      tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
+
+      if (tile_col < cm->tile_columns - 1 || tile_row < cm->tile_rows - 1)
+        vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+      else
+        vp9_start_encode(&residual_bc, data_ptr + total_size);
+
+      write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
+      assert(tok[tile_row][tile_col] == tok_end);
+      vp9_stop_encode(&residual_bc);
+      if (tile_col < cm->tile_columns - 1 || tile_row < cm->tile_rows - 1) {
+        // size of this tile
+        write_be32(data_ptr + total_size, residual_bc.pos);
+        total_size += 4;
+      }
+
+      total_size += residual_bc.pos;
+    }
+  }
+
+  return total_size;
+}
+
 static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
 
@@ -1339,7 +1388,7 @@
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
   int i, bytes_packed;
   VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
+  vp9_writer header_bc;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   uint8_t *cx_data = dest;
@@ -1469,51 +1518,7 @@
   assert(header_bc.pos <= 0xffff);
   vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16);
   *size = bytes_packed + header_bc.pos;
-
-  {
-    int tile_row, tile_col, total_size = 0;
-    unsigned char *data_ptr = cx_data + header_bc.pos;
-    TOKENEXTRA *tok[4][1 << 6], *tok_end;
-
-    vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
-               mi_cols_aligned_to_sb(pc->mi_cols));
-    tok[0][0] = cpi->tok;
-    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
-      if (tile_row) {
-        tok[tile_row][0] = tok[tile_row - 1][pc->tile_columns - 1] +
-                           cpi->tok_count[tile_row - 1][pc->tile_columns - 1];
-      }
-      for (tile_col = 1; tile_col < pc->tile_columns; tile_col++) {
-        tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
-                                  cpi->tok_count[tile_row][tile_col - 1];
-      }
-    }
-
-    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
-      vp9_get_tile_row_offsets(pc, tile_row);
-      for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
-        vp9_get_tile_col_offsets(pc, tile_col);
-        tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
-
-        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
-          vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
-        else
-          vp9_start_encode(&residual_bc, data_ptr + total_size);
-        write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
-        assert(tok[tile_row][tile_col] == tok_end);
-        vp9_stop_encode(&residual_bc);
-        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
-          // size of this tile
-          write_be32(data_ptr + total_size, residual_bc.pos);
-          total_size += 4;
-        }
-
-        total_size += residual_bc.pos;
-      }
-    }
-
-    *size += total_size;
-  }
+  *size += encode_tiles(cpi, cx_data + header_bc.pos);
 }
 
 #ifdef ENTROPY_STATS
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ce81e05..9c04d8a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -324,9 +324,8 @@
   int mb_mode_index = ctx->best_mode_index;
   const int mis = cpi->common.mode_info_stride;
   const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
-  const MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
 
-  assert(mb_mode < MB_MODE_COUNT);
+  assert(mi->mbmi.mode < MB_MODE_COUNT);
   assert(mb_mode_index < MAX_MODES);
   assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
   assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
@@ -1170,7 +1169,7 @@
   int mh = bh / 2;
   int bss = (1 << bsl) / 4;
   int i, pl;
-  PARTITION_TYPE partition;
+  PARTITION_TYPE partition = PARTITION_NONE;
   BLOCK_SIZE_TYPE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
@@ -1421,6 +1420,8 @@
   int srate = INT_MAX;
   int64_t sdist = INT_MAX;
 
+  (void) *tp_orig;
+
   if (bsize < BLOCK_SIZE_SB8X8)
     if (xd->ab_index != 0) {
       *rate = 0;
@@ -1928,13 +1929,13 @@
   if (mbmi->txfm_size > txfm_max) {
     MACROBLOCK * const x = &cpi->mb;
     MACROBLOCKD * const xd = &x->e_mbd;
-    const int segment_id = mbmi->segment_id;
     const int ymbs = MIN(bh, cm->mi_rows - mi_row);
     const int xmbs = MIN(bw, cm->mi_cols - mi_col);
 
     xd->mode_info_context = mi;
     assert(
-        vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) || get_skip_flag(mi, mis, ymbs, xmbs));
+        vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP) ||
+        get_skip_flag(mi, mis, ymbs, xmbs));
     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
   }
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index a658b1b..cc3c5c0 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -719,6 +719,7 @@
   sf->last_partitioning_redo_frequency = 4;
   sf->disable_splitmv = 0;
   sf->mode_search_skip_flags = 0;
+  sf->last_chroma_intra_mode = TM_PRED;
 
   // Skip any mode not chosen at size < X for all sizes > X
   // Hence BLOCK_SIZE_SB64X64 (skip is off)
@@ -746,6 +747,7 @@
       sf->auto_mv_step_size = 1;
       sf->use_avoid_tested_higherror = 1;
       sf->adaptive_rd_thresh = 1;
+      sf->last_chroma_intra_mode = TM_PRED;
 
       if (speed == 1) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -764,6 +766,7 @@
         sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA;
+        sf->last_chroma_intra_mode = H_PRED;
       }
       if (speed == 2) {
         sf->adjust_thresholds_by_speed = 1;
@@ -786,6 +789,7 @@
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_COMP_REFMISMATCH;
+        sf->last_chroma_intra_mode = DC_PRED;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index d640da7..0ecbf35 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -274,6 +274,7 @@
   // The heuristics selected are based on  flags
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
+  MB_PREDICTION_MODE last_chroma_intra_mode;
 } SPEED_FEATURES;
 
 enum BlockSize {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0d32832..6116f3f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -478,8 +478,8 @@
                                  MACROBLOCK *x, MACROBLOCKD *xd,
                                  int *out_rate_sum, int64_t *out_dist_sum,
                                  int *out_skip) {
-  int t, j, k;
-  enum BlockSize bs;
+  int t = 4, j, k;
+  enum BlockSize bs = BLOCK_4X4;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
   const int bw = plane_block_width(bsize, pd);
@@ -551,26 +551,25 @@
   int pt;
   int c = 0;
   int cost = 0;
-  const int16_t *scan, *nb;
+  const int16_t *scan = NULL, *nb;
   const int eob = xd->plane[plane].eobs[block];
   const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
   unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
                     [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
-  ENTROPY_CONTEXT above_ec, left_ec;
+  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
   TX_TYPE tx_type = DCT_DCT;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int seg_eob;
+  int seg_eob = 0;
   uint8_t token_cache[1024];
-  const uint8_t * band_translate;
+  const uint8_t *band_translate = NULL;
 
   // Check for consistency of tx_size with mode info
   assert((!type && !plane) || (type && plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
   } else {
-    TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
-    assert(tx_size == tx_size_uv);
+    assert(tx_size == get_uv_tx_size(mbmi));
   }
 
   switch (tx_size) {
@@ -1500,12 +1499,16 @@
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE_TYPE bsize) {
   MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE last_mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int64_t best_rd = INT64_MAX, this_rd;
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion;
 
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+  last_mode = bsize <= BLOCK_SIZE_SB8X8 ?
+              TM_PRED : cpi->sf.last_chroma_intra_mode;
+
+  for (mode = DC_PRED; mode <= last_mode; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
                      &this_distortion, &s, NULL, bsize);
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 87a774c..bf09c7a 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -557,6 +557,8 @@
 
 // right shift and rounding
 static INLINE void right_shift_8x8(__m128i *res, int const bit) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  const int bit_m02 = bit - 2;
   __m128i sign0 = _mm_srai_epi16(res[0], 15);
   __m128i sign1 = _mm_srai_epi16(res[1], 15);
   __m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -566,6 +568,18 @@
   __m128i sign6 = _mm_srai_epi16(res[6], 15);
   __m128i sign7 = _mm_srai_epi16(res[7], 15);
 
+  if (bit_m02 >= 0) {
+    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+    res[0] = _mm_add_epi16(res[0], k_const_rounding);
+    res[1] = _mm_add_epi16(res[1], k_const_rounding);
+    res[2] = _mm_add_epi16(res[2], k_const_rounding);
+    res[3] = _mm_add_epi16(res[3], k_const_rounding);
+    res[4] = _mm_add_epi16(res[4], k_const_rounding);
+    res[5] = _mm_add_epi16(res[5], k_const_rounding);
+    res[6] = _mm_add_epi16(res[6], k_const_rounding);
+    res[7] = _mm_add_epi16(res[7], k_const_rounding);
+  }
+
   res[0] = _mm_sub_epi16(res[0], sign0);
   res[1] = _mm_sub_epi16(res[1], sign1);
   res[2] = _mm_sub_epi16(res[2], sign2);
@@ -587,8 +601,6 @@
 
 // write 8x8 array
 static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
-  right_shift_8x8(res, 1);
-
   _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
   _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
@@ -1046,6 +1058,7 @@
       assert(0);
       break;
   }
+  right_shift_8x8(in, 1);
   write_buffer_8x8(output, in, 8);
 }
 
@@ -1675,6 +1688,890 @@
   }
 }
 
+static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+                                     __m128i *in1, int stride) {
+  // load first 8 columns
+  load_buffer_8x8(input, in0, stride);
+  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  load_buffer_8x8(input, in1, stride);
+  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  // write first 8 columns
+  write_buffer_8x8(output, in0, stride);
+  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+  // write second 8 columns
+  output += 8;
+  write_buffer_8x8(output, in1, stride);
+  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i tbuf[8];
+  array_transpose_8x8(res0, res0);
+  array_transpose_8x8(res1, tbuf);
+  array_transpose_8x8(res0 + 8, res1);
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  res0[8] = tbuf[0];
+  res0[9] = tbuf[1];
+  res0[10] = tbuf[2];
+  res0[11] = tbuf[3];
+  res0[12] = tbuf[4];
+  res0[13] = tbuf[5];
+  res0[14] = tbuf[6];
+  res0[15] = tbuf[7];
+}
+
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+  // perform rounding operations
+  right_shift_8x8(res0, 2);
+  right_shift_8x8(res0 + 8, 2);
+  right_shift_8x8(res1, 2);
+  right_shift_8x8(res1 + 8, 2);
+}
+
+void fdct16_1d_8col(__m128i *in) {
+  // perform 16x16 1-D DCT for 8 columns
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_sub_epi16(p[3], t[2]);
+  s[3] = _mm_add_epi16(p[3], t[2]);
+  s[4] = _mm_add_epi16(p[4], t[5]);
+  s[5] = _mm_sub_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[1]  = _mm_packs_epi32(v[0], v[1]);
+  in[9]  = _mm_packs_epi32(v[2], v[3]);
+  in[5]  = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3]  = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7]  = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+void fadst16_1d_8col(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+// One 1-D forward 16-point DCT pass over a 16x16 block stored as two
+// 8-column halves (in0 and in1 each hold 16 __m128i rows of 8 int16
+// values). Each half is transformed independently, then the combined
+// 16x16 array is transposed so the next pass operates on the other
+// dimension.
+void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_1d_8col(in0);
+  fdct16_1d_8col(in1);
+  array_transpose_16x16(in0, in1);
+}
+
+// One 1-D forward 16-point ADST pass; same layout and structure as
+// fdct16_1d_sse2: transform each 8-column half, then transpose the
+// full 16x16 array for the following pass.
+void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_1d_8col(in0);
+  fadst16_1d_8col(in1);
+  array_transpose_16x16(in0, in1);
+}
+
+// Forward 16x16 hybrid transform. tx_type selects the DCT/ADST
+// combination for the two 1-D passes: 0 = DCT_DCT, 1 = ADST_DCT,
+// 2 = DCT_ADST, 3 = ADST_ADST. Each case applies the first 1-D
+// transform, an intermediate right shift (presumably rounding the
+// values back into 16-bit range between passes — TODO confirm against
+// right_shift_16x16), then the second 1-D transform.
+void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+                             int stride, int tx_type) {
+  __m128i in0[16], in1[16];  // left / right 8-column halves of the block
+  load_buffer_16x16(input, in0, in1, stride);
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      fdct16_1d_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_1d_sse2(in0, in1);
+      break;
+    case 1:  // ADST_DCT
+      fadst16_1d_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_1d_sse2(in0, in1);
+      break;
+    case 2:  // DCT_ADST
+      fdct16_1d_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_1d_sse2(in0, in1);
+      break;
+    case 3:  // ADST_ADST
+      fadst16_1d_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_1d_sse2(in0, in1);
+      break;
+    default:
+      assert(0);  // invalid tx_type
+      break;
+  }
+  // Output is written densely with a stride of 16 (16x16 coefficients).
+  write_buffer_16x16(output, in0, in1, 16);
+}
+
 void vp9_short_fdct32x32_rd_sse2(int16_t *input,
                                  int16_t *output_org, int pitch) {
   // Calculate pre-multiplied strides
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index e17d4e3..95ea60b 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -89,12 +89,6 @@
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
 
-# common (c)
-ifeq ($(CONFIG_CSM),yes)
-VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
-VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
-endif
-
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index e5b5089..34adb8e 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -1032,6 +1032,7 @@
   {VP8E_SET_CQ_LEVEL,                 set_param},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
   {VP9E_SET_LOSSLESS,                 set_param},
+  {VP9E_SET_FRAME_PARALLEL_DECODING,  set_param},
   {VP9_GET_REFERENCE,                 get_reference},
   { -1, NULL},
 };