Merge "Moving vp9_kf_default_bmode_probs to vp9_entropymode.c."

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 3538c7b..9dcc078 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -44,10 +44,8 @@
 
 class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
  public:
-  FwdTrans4x4Test() { SetUpTestTxfm(); }
-  ~FwdTrans4x4Test() {}
-
-  void SetUpTestTxfm() {
+  virtual ~FwdTrans4x4Test() {}
+  virtual void SetUp() {
     tx_type_ = GetParam();
     if (tx_type_ == 0) {
       fwd_txfm_ = fdct4x4;

diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index eeae208..50e2e9d 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc

@@ -51,10 +51,8 @@
 
 class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
  public:
-  FwdTrans8x8Test() { SetUpTestTxfm(); }
-  ~FwdTrans8x8Test() {}
-
-  void SetUpTestTxfm() {
+  virtual ~FwdTrans8x8Test() {}
+  virtual void SetUp() {
     tx_type_ = GetParam();
     if (tx_type_ == 0) {
       fwd_txfm = fdct8x8;

diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index 8af9e90..54afc13 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c

@@ -173,7 +173,6 @@
     oci->use_bilinear_mc_filter = 0;
     oci->full_pixel = 0;
     oci->multi_token_partition = ONE_PARTITION;
-    oci->clr_type = REG_YUV;
     oci->clamp_type = RECON_CLAMP_REQUIRED;
 
     /* Initialize reference frame sign bias structure to defaults */

diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 276dd72..a0c97a1 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h

@@ -72,7 +72,6 @@
     int horiz_scale;
     int vert_scale;
 
-    YUV_TYPE clr_type;
     CLAMP_TYPE  clamp_type;
 
     YV12_BUFFER_CONFIG *frame_to_show;
@@ -157,7 +156,6 @@
 
     unsigned int current_video_frame;
 
-    int near_boffset[3];
     int version;
 
     TOKEN_PARTITION multi_token_partition;
@@ -165,8 +163,10 @@
 #ifdef PACKET_TESTING
     VP8_HEADER oh;
 #endif
+#if CONFIG_POSTPROC_VISUALIZER
     double bitrate;
     double framerate;
+#endif
 
 #if CONFIG_MULTITHREAD
     int processor_core_count;

diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 44c35ef..51eeb02 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c

@@ -1095,7 +1095,7 @@
         vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate bool decoder 0");
     if (pc->frame_type == KEY_FRAME) {
-        pc->clr_type    = (YUV_TYPE)vp8_read_bit(bc);
+        (void)vp8_read_bit(bc);  /* colorspace bit: read and discarded */
         pc->clamp_type  = (CLAMP_TYPE)vp8_read_bit(bc);
     }
 

diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 2db3096..2d9e343 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c

@@ -430,7 +430,6 @@
     *time_stamp = pbi->last_time_stamp;
     *time_end_stamp = 0;
 
-    sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
     ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else

diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 4707ae5..5f0c1f7 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c

@@ -1322,7 +1322,7 @@
         vp8_start_encode(bc, cx_data, cx_data_end);
 
         /* signal clr type */
-        vp8_write_bit(bc, pc->clr_type);
+        vp8_write_bit(bc, 0);
         vp8_write_bit(bc, pc->clamp_type);
 
     }

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 73f6583..11f1695 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -4821,8 +4821,10 @@
 {
 #if HAVE_NEON
     int64_t store_reg[8];
-#endif
+#if CONFIG_RUNTIME_CPU_DETECT
     VP8_COMMON            *cm = &cpi->common;
+#endif
+#endif
     struct vpx_usec_timer  timer;
     int                    res = 0;
 
@@ -4848,7 +4850,6 @@
     if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
                           frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
         res = -1;
-    cm->clr_type = sd->clrtype;
     vpx_usec_timer_mark(&timer);
     cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 4531d5a..9a7b9c5 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c

@@ -695,7 +695,6 @@
     yv12->uv_stride = img->stride[VPX_PLANE_U];
 
     yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
     return res;
 }
 
@@ -1079,11 +1078,7 @@
         ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
         ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
 
-        if (sd.clrtype == REG_YUV)
-            ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-        else
-            ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
+        ctx->preview_img.fmt = VPX_IMG_FMT_I420;
         ctx->preview_img.x_chroma_shift = 1;
         ctx->preview_img.y_chroma_shift = 1;
 

diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index b552b84..871b8d3 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c

@@ -288,8 +288,7 @@
       * the Y, U, and V planes, nor other alignment adjustments that
       * might be representable by a YV12_BUFFER_CONFIG, so we just
       * initialize all the fields.*/
-    img->fmt = yv12->clrtype == REG_YUV ?
-        VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+    img->fmt = VPX_IMG_FMT_I420;
     img->w = yv12->y_stride;
     img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
     img->d_w = yv12->y_width;
@@ -721,8 +720,6 @@
     yv12->uv_stride = img->stride[VPX_PLANE_U];
 
     yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
-
     return res;
 }
 

diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 96b27bf..749efe2 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c

@@ -158,10 +158,6 @@
   if (!oci->above_context[0])
     goto fail;
 
-  for (i = 1; i < MAX_MB_PLANE; i++)
-    oci->above_context[i] =
-        oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
-
   oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
   if (!oci->above_seg_context)
     goto fail;
@@ -180,7 +176,6 @@
 
   oci->txfm_mode = ONLY_4X4;
   oci->comp_pred_mode = HYBRID_PREDICTION;
-  oci->clr_type = REG_YUV;
 
   // Initialize reference frame sign bias structure to defaults
   vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
@@ -197,9 +192,15 @@
 }
 
 void vp9_update_frame_size(VP9_COMMON *cm) {
+  int i, mi_cols;
   const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
   const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);
 
   set_mb_mi(cm, aligned_width, aligned_height);
   setup_mi(cm);
+  // Planes 1.. share plane 0's allocation: point each at its own slice.
+  mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    cm->above_context[i] =
+        cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
+}

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 2ca9898..f56586a 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -258,9 +258,6 @@
   int mb_to_top_edge;
   int mb_to_bottom_edge;
 
-  unsigned int frames_since_golden;
-  unsigned int frames_till_alt_ref_frame;
-
   int lossless;
   /* Inverse transform function pointers. */
   void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ad4471a..1cee34a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -137,8 +137,6 @@
   int subsampling_x;
   int subsampling_y;
 
-  YUV_TYPE clr_type;
-
   YV12_BUFFER_CONFIG *frame_to_show;
 
   YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
@@ -247,12 +245,8 @@
   unsigned int  frame_context_idx; /* Context to use/update */
 
   unsigned int current_video_frame;
-  int near_boffset[3];
   int version;
 
-  double bitrate;
-  double framerate;
-
 #if CONFIG_POSTPROC
   struct postproc_state  postproc_state;
 #endif

diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 2b81c2e..48d86c5 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c

@@ -12,6 +12,79 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#if 1
+static const int16_t dc_qlookup[QINDEX_RANGE] = {  // looked up by vp9_dc_quant()
+     4,    8,    8,    9,   10,   11,   12,   12,
+    13,   14,   15,   16,   17,   18,   19,   19,
+    20,   21,   22,   23,   24,   25,   26,   26,
+    27,   28,   29,   30,   31,   32,   32,   33,
+    34,   35,   36,   37,   38,   38,   39,   40,
+    41,   42,   43,   43,   44,   45,   46,   47,
+    48,   48,   49,   50,   51,   52,   53,   53,
+    54,   55,   56,   57,   57,   58,   59,   60,
+    61,   62,   62,   63,   64,   65,   66,   66,
+    67,   68,   69,   70,   70,   71,   72,   73,
+    74,   74,   75,   76,   77,   78,   78,   79,
+    80,   81,   81,   82,   83,   84,   85,   85,
+    87,   88,   90,   92,   93,   95,   96,   98,
+    99,  101,  102,  104,  105,  107,  108,  110,
+   111,  113,  114,  116,  117,  118,  120,  121,
+   123,  125,  127,  129,  131,  134,  136,  138,
+   140,  142,  144,  146,  148,  150,  152,  154,
+   156,  158,  161,  164,  166,  169,  172,  174,
+   177,  180,  182,  185,  187,  190,  192,  195,
+   199,  202,  205,  208,  211,  214,  217,  220,
+   223,  226,  230,  233,  237,  240,  243,  247,
+   250,  253,  257,  261,  265,  269,  272,  276,
+   280,  284,  288,  292,  296,  300,  304,  309,
+   313,  317,  322,  326,  330,  335,  340,  344,
+   349,  354,  359,  364,  369,  374,  379,  384,
+   389,  395,  400,  406,  411,  417,  423,  429,
+   435,  441,  447,  454,  461,  467,  475,  482,
+   489,  497,  505,  513,  522,  530,  539,  549,
+   559,  569,  579,  590,  602,  614,  626,  640,
+   654,  668,  684,  700,  717,  736,  755,  775,
+   796,  819,  843,  869,  896,  925,  955,  988,
+  1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+};
+
+static const int16_t ac_qlookup[QINDEX_RANGE] = {  // AC table, one per qindex
+     4,    8,    9,   10,   11,   12,   13,   14,
+    15,   16,   17,   18,   19,   20,   21,   22,
+    23,   24,   25,   26,   27,   28,   29,   30,
+    31,   32,   33,   34,   35,   36,   37,   38,
+    39,   40,   41,   42,   43,   44,   45,   46,
+    47,   48,   49,   50,   51,   52,   53,   54,
+    55,   56,   57,   58,   59,   60,   61,   62,
+    63,   64,   65,   66,   67,   68,   69,   70,
+    71,   72,   73,   74,   75,   76,   77,   78,
+    79,   80,   81,   82,   83,   84,   85,   86,
+    87,   88,   89,   90,   91,   92,   93,   94,
+    95,   96,   97,   98,   99,  100,  101,  102,
+   104,  106,  108,  110,  112,  114,  116,  118,
+   120,  122,  124,  126,  128,  130,  132,  134,
+   136,  138,  140,  142,  144,  146,  148,  150,
+   152,  155,  158,  161,  164,  167,  170,  173,
+   176,  179,  182,  185,  188,  191,  194,  197,
+   200,  203,  207,  211,  215,  219,  223,  227,
+   231,  235,  239,  243,  247,  251,  255,  260,
+   265,  270,  275,  280,  285,  290,  295,  300,
+   305,  311,  317,  323,  329,  335,  341,  347,
+   353,  359,  366,  373,  380,  387,  394,  401,
+   408,  416,  424,  432,  440,  448,  456,  465,
+   474,  483,  492,  501,  510,  520,  530,  540,
+   550,  560,  571,  582,  593,  604,  615,  627,
+   639,  651,  663,  676,  689,  702,  715,  729,
+   743,  757,  771,  786,  801,  816,  832,  848,
+   864,  881,  898,  915,  933,  951,  969,  988,
+  1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
+  1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+  1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
+  1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+void vp9_init_quant_tables(void) { }  // no-op: the tables are static const
+#else
 static int16_t dc_qlookup[QINDEX_RANGE];
 static int16_t ac_qlookup[QINDEX_RANGE];
 
@@ -46,6 +119,7 @@
                                                  0.5, ac_val));
   }
 }
+#endif
 
 int16_t vp9_dc_quant(int qindex, int delta) {
   return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8bab9f0..82e20aa 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -328,7 +328,7 @@
 specialize vp9_short_iht4x4_add sse2
 
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add
+specialize vp9_short_iht8x8_add sse2
 
 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16_add

diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 4495b15..1bf869d 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -638,6 +638,373 @@
   RECON_AND_STORE(dest, in7);
 }
 
+// Transpose the 8x8 block of 16-bit values held in in[0..7] into res[0..7].
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // Second pass: interleave 32-bit pairs of the results above.
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // in[] is fully read by now, so the stores below are safe when res == in.
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+void idct8_1d_sse2(__m128i *in) {  // in place: transpose + 1-D IDCT
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  // Work on local copies of the eight input vectors.
+  in0 = in[0];
+  in1 = in[1];
+  in2 = in[2];
+  in3 = in[3];
+  in4 = in[4];
+  in5 = in[5];
+  in6 = in[6];
+  in7 = in[7];
+
+  // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                in4, in5, in6, in7);
+  // IDCT8x8_1D has no parameters: it presumably uses the locals above by name.
+  // 4-stage 1D idct8x8
+  IDCT8x8_1D
+  in[0] = in0;
+  in[1] = in1;
+  in[2] = in2;
+  in[3] = in3;
+  in[4] = in4;
+  in[5] = in5;
+  in[6] = in6;
+  in[7] = in7;
+}
+
+void iadst8_1d_sse2(__m128i *in) {  // in place: transpose + 1-D inverse ADST
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // transpose (in place; the helper reads all rows before writing any)
+  array_transpose_8x8(in, in);
+
+  // reorder rows into the pairing used by the stage-1 butterflies
+  in0  = in[7];
+  in1  = in[0];
+  in2  = in[5];
+  in3  = in[2];
+  in4  = in[3];
+  in5  = in[4];
+  in6  = in[1];
+  in7  = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // butterfly: sums in w0..w7, differences in w8..w15
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // round, then arithmetic shift right by DCT_CONST_BITS
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // pack back to 16 bits with signed saturation (_mm_packs_epi32)
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+  // final output order; the odd-indexed outputs are negated
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+// 8x8 inverse transform: two 1-D passes selected by tx_type, output to dest.
+void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
+                               int tx_type) {
+  __m128i in[8];
+  const __m128i zero = _mm_setzero_si128();  // for RECON_AND_STORE? (verify)
+  const __m128i final_rounding = _mm_set1_epi16(1<<4);
+
+  // load 8 rows; _mm_load_si128 requires input to be 16-byte aligned
+  in[0] = _mm_load_si128((__m128i *)input);
+  in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
+  in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
+  in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
+  in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
+  in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
+  in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
+  in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+  // Each 1-D helper transposes first, so two calls cover both dimensions.
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      idct8_1d_sse2(in);
+      idct8_1d_sse2(in);
+      break;
+    case 1:  // ADST_DCT
+      idct8_1d_sse2(in);
+      iadst8_1d_sse2(in);
+      break;
+    case 2:  // DCT_ADST
+      iadst8_1d_sse2(in);
+      idct8_1d_sse2(in);
+      break;
+    case 3:  // ADST_ADST
+      iadst8_1d_sse2(in);
+      iadst8_1d_sse2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // Final rounding and shift: (x + 16) >> 5, using a saturating add
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 5);
+  in[1] = _mm_srai_epi16(in[1], 5);
+  in[2] = _mm_srai_epi16(in[2], 5);
+  in[3] = _mm_srai_epi16(in[3], 5);
+  in[4] = _mm_srai_epi16(in[4], 5);
+  in[5] = _mm_srai_epi16(in[5], 5);
+  in[6] = _mm_srai_epi16(in[6], 5);
+  in[7] = _mm_srai_epi16(in[7], 5);
+  // One RECON_AND_STORE per row; presumably the macro steps dest by stride.
+  RECON_AND_STORE(dest, in[0]);
+  RECON_AND_STORE(dest, in[1]);
+  RECON_AND_STORE(dest, in[2]);
+  RECON_AND_STORE(dest, in[3]);
+  RECON_AND_STORE(dest, in[4]);
+  RECON_AND_STORE(dest, in[5]);
+  RECON_AND_STORE(dest, in[6]);
+  RECON_AND_STORE(dest, in[7]);
+}
+
 void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index b9c7f30..f734eae 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c

@@ -413,7 +413,6 @@
   *time_stamp = pbi->last_time_stamp;
   *time_end_stamp = 0;
 
-  sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
   ret = vp9_post_proc_frame(&pbi->common, sd, flags);
 #else

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8ac46c0..b49f9a3 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1705,9 +1705,6 @@
   xd->mode_info_stride = cm->mode_info_stride;
   xd->frame_type = cm->frame_type;
 
-  xd->frames_since_golden = cm->frames_since_golden;
-  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
-
   // reset intra mode contexts
   if (cm->frame_type == KEY_FRAME)
     vp9_init_mbmode_probs(cm);

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index cbb9f87..84879fc 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -3505,7 +3505,6 @@
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL))
     res = -1;
-  cm->clr_type = sd->clrtype;
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 

diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 0c1f373..ed0122c 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h

@@ -74,8 +74,6 @@
   yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
 
   yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = REG_YUV;
-
 #if CONFIG_ALPHA
   // For development purposes, force alpha to hold the same data a Y for now.
   yv12->alpha_buffer = yv12->y_buffer;

diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index a919e49..66e587a 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h

@@ -22,24 +22,6 @@
 #define VP9BORDERINPIXELS      160
 #define VP9_INTERP_EXTEND        4
 
-  /*************************************
-   For INT_YUV:
-
-   Y = (R+G*2+B)/4;
-   U = (R-B)/2;
-   V =  (G*2 - R - B)/4;
-  And
-   R = Y+U-V;
-   G = Y+V;
-   B = Y-U-V;
-  ************************************/
-  typedef enum
-  {
-    REG_YUV = 0,    /* Regular yuv */
-    INT_YUV = 1     /* The type of yuv that can be tranfer to and from RGB through integer transform */
-  }
-            YUV_TYPE;
-
   typedef struct yv12_buffer_config {
     int   y_width;
     int   y_height;
@@ -68,7 +50,6 @@
     int buffer_alloc_sz;
     int border;
     int frame_size;
-    YUV_TYPE clrtype;
 
     int corrupted;
     int flags;