Merge remote branch 'internal/upstream-experimental' into HEAD

Conflicts:
	vp8/decoder/detokenize.c
	vp8/decoder/onyxd_if.c
	vp8/vp8_common.mk

Change-Id: Ifca1108186a8bc715da86a44021ee2fa5550b5b8
diff --git a/configure b/configure
index dfc3ffd..4b88f45 100755
--- a/configure
+++ b/configure
@@ -218,6 +218,9 @@
 "
 EXPERIMENT_LIST="
     extend_qrange
+    segmentation
+    t8x8
+    csm
 "
 CONFIG_LIST="
     external_build
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 9615523..f66e73c 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -20,6 +20,7 @@
 #include "treecoder.h"
 #include "subpixel.h"
 #include "vpx_ports/mem.h"
+#include "common.h"
 
 #define TRUE    1
 #define FALSE   0
@@ -29,6 +30,7 @@
 #define DCPREDCNTTHRESH 3
 
 #define MB_FEATURE_TREE_PROBS   3
+
 #define MAX_MB_SEGMENTS         4
 
 #define MAX_REF_LF_DELTAS       4
@@ -64,6 +66,10 @@
 #define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
     Dest = ((A)!=0) + ((B)!=0);
 
+#if CONFIG_T8X8
+#define VP8_COMBINEENTROPYCONTEXTS_8x8( Dest, A1, B1, A2, B2) \
+    Dest = ((A1)!=0 || (A2)!=0) + ((B1)!=0 || (B2)!=0);
+#endif
 
 typedef enum
 {
@@ -157,7 +163,9 @@
     MB_PREDICTION_MODE mode, uv_mode;
     MV_REFERENCE_FRAME ref_frame;
     int_mv mv;
-
+#if CONFIG_SEGMENTATION
+    unsigned char segment_flag;
+#endif
     unsigned char partitioning;
     unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
     unsigned char need_to_clamp_mvs;
@@ -232,9 +240,13 @@
 
     /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
     /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
-    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];         /* Probability Tree used to code Segment number */
-
-    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            /* Segment parameters */
+#if CONFIG_SEGMENTATION
+    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS + 3];         /* Probability tree used to code the segment number (3 extra entries in this configuration) */
+    unsigned char temporal_update;
+#else
+    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];         /* Probability tree used to code the segment number */
+#endif
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            /* Segment parameters */
 
     /* mode_based Loop filter adjustment */
     unsigned char mode_ref_lf_delta_enabled;
diff --git a/vp8/common/coefupdateprobs.h b/vp8/common/coefupdateprobs.h
index 9e194dc..6fe5fcc 100644
--- a/vp8/common/coefupdateprobs.h
+++ b/vp8/common/coefupdateprobs.h
@@ -183,3 +183,180 @@
         },
     },
 };
+#if CONFIG_T8X8
+const vp8_prob vp8_coef_update_probs_8x8 [BLOCK_TYPES]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] =
+{
+    {
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {219, 234, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {239, 204, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 209, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {239, 219, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 204, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {229, 209, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 193, 209, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {229, 198, 239, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {219, 198, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 198, 204, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {209, 193, 234, 249, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 249, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 214, 214, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {173, 193, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+    },
+    {
+      {
+        {255, 255, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {224, 224, 219, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {229, 239, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 234, 224, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {224, 234, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 255, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {229, 255, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 255, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {224, 255, 239, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+    },
+    {
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {224, 219, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {234, 183, 214, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 193, 229, 255, 249, 255, 255, 255, 255, 255, 255, },
+        {229, 214, 234, 249, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 249, 255, 255, 249, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 198, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {229, 219, 249, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 249, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 193, 224, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {229, 204, 234, 249, 249, 255, 255, 255, 255, 255, 255, },
+        {255, 249, 249, 255, 244, 249, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 178, 224, 255, 249, 255, 255, 255, 255, 255, 255, },
+        {234, 224, 234, 249, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 183, 229, 255, 249, 255, 255, 255, 255, 255, 255, },
+        {234, 219, 234, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 249, 249, 255, 249, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 193, 224, 249, 255, 244, 255, 255, 255, 255, 255, },
+        {219, 224, 229, 255, 255, 249, 255, 255, 255, 255, 255, },
+        {255, 255, 255, 249, 249, 255, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 193, 229, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {224, 224, 239, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {249, 244, 249, 255, 255, 255, 255, 255, 255, 255, 255, },
+      },
+    },
+    {
+      {
+        {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        {255, 239, 234, 244, 239, 244, 249, 255, 255, 255, 255, },
+      },
+      {
+        {255, 249, 239, 239, 244, 255, 255, 255, 255, 255, 255, },
+        {255, 249, 244, 255, 249, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 239, 255, 255, 249, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 244, 239, 239, 244, 255, 255, 255, 255, 255, 255, },
+        {255, 234, 239, 234, 249, 255, 255, 255, 255, 255, 255, },
+        {255, 255, 229, 239, 234, 249, 244, 255, 255, 255, 255, },
+      },
+      {
+        {255, 239, 229, 239, 234, 234, 255, 255, 255, 255, 255, },
+        {255, 239, 234, 229, 244, 239, 255, 234, 255, 255, 255, },
+        {255, 229, 209, 229, 239, 234, 244, 229, 255, 249, 255, },
+      },
+      {
+        {255, 239, 234, 229, 244, 249, 255, 249, 255, 255, 255, },
+        {255, 234, 229, 244, 234, 249, 255, 249, 255, 255, 255, },
+        {255, 229, 239, 229, 249, 255, 255, 244, 255, 255, 255, },
+      },
+      {
+        {255, 239, 234, 239, 234, 239, 255, 249, 255, 255, 255, },
+        {255, 229, 234, 239, 239, 239, 255, 244, 255, 255, 255, },
+        {255, 229, 234, 239, 239, 244, 255, 255, 255, 255, 255, },
+      },
+      {
+        {255, 219, 224, 229, 229, 234, 239, 224, 255, 255, 255, },
+        {255, 229, 229, 224, 234, 229, 239, 239, 255, 255, 255, },
+        {255, 229, 224, 239, 234, 239, 224, 224, 255, 249, 255, },
+      },
+      {
+        {255, 234, 229, 244, 229, 229, 255, 214, 255, 255, 255, },
+        {255, 239, 234, 239, 214, 239, 255, 209, 255, 255, 255, },
+        {249, 239, 219, 209, 219, 224, 239, 204, 255, 255, 255, },
+      },
+    },
+
+};
+#endif
\ No newline at end of file
diff --git a/vp8/common/common.h b/vp8/common/common.h
index 9a93da9..999f79f 100644
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -13,7 +13,7 @@
 #define common_h 1
 
 #include <assert.h>
-
+#include "vpx_config.h"
 /* Interface header for common constant data structures and lookup tables */
 
 #include "vpx_mem/vpx_mem.h"
@@ -38,5 +38,4 @@
 
 #define vp8_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
 
-
 #endif  /* common_h */
diff --git a/vp8/common/defaultcoefcounts.c b/vp8/common/defaultcoefcounts.c
index b0e2e70..34d1fb1 100644
--- a/vp8/common/defaultcoefcounts.c
+++ b/vp8/common/defaultcoefcounts.c
@@ -223,3 +223,182 @@
         },
     },
 };
+
+
+#if CONFIG_T8X8
+const unsigned int vp8_default_coef_counts_8x8[BLOCK_TYPES]
+                                              [COEF_BANDS]
+                                              [PREV_COEF_CONTEXTS]
+                                              [MAX_ENTROPY_TOKENS] =
+{
+
+    { /* block Type 0 */
+      { /* Coeff Band 0 */
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 1 */
+        { 21041, 13314, 3420, 592, 117, 0, 0, 0, 0, 0, 0, 11783},
+        { 48236, 6918, 586, 153, 0, 0, 0, 0, 0, 0, 0, 23137},
+        { 676112, 106685, 24701, 6003, 1426, 429, 165, 0, 0, 0, 0, 28910}
+      },
+      { /* Coeff Band 2 */
+        { 660107, 75227, 8451, 1345, 259, 0, 0, 0, 0, 0, 0, 0},
+        { 79164, 36835, 6865, 1185, 246, 47, 0, 0, 0, 0, 0, 2575},
+        { 19469, 14330, 3070, 579, 94, 6, 0, 0, 0, 0, 0, 44}
+      },
+      { /* Coeff Band 3 */
+        { 1978004, 235343, 28485, 3242, 271, 0, 0, 0, 0, 0, 0, 0},
+        { 228684, 106736, 21431, 2842, 272, 46, 0, 0, 0, 0, 0, 9266},
+        { 32470, 27496, 6852, 1386, 45, 93, 0, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 4 */
+        { 1911212, 224613, 49653, 13748, 2541, 568, 48, 0, 0, 0, 0, 0},
+        { 196670, 103472, 44473, 11490, 2432, 977, 72, 0, 0, 0, 0, 9447},
+        { 37876, 40417, 19142, 6069, 1799, 727, 51, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 5 */
+        { 3813399, 437714, 64387, 11312, 695, 219, 0, 0, 0, 0, 0, 0},
+        { 438288, 215917, 61905, 10194, 674, 107, 0, 0, 0, 0, 0, 17808},
+        { 99139, 93643, 30054, 5758, 802, 171, 0, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 6 */
+        { 12259383, 1625505, 234927, 46306, 8417, 1456, 151, 0, 0, 0, 0, 0},
+        { 1518161, 734287, 204240, 44228, 9462, 2240, 65, 0, 0, 0, 0, 107630},
+        { 292470, 258894, 94925, 25864, 6662, 2055, 170, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 7 */
+        { 9791308, 2118949, 169439, 16735, 1122, 0, 0, 0, 0, 0, 0, 0},
+        { 1500281, 752410, 123259, 13065, 1168, 47, 0, 0, 0, 0, 0, 707182},
+        { 193067, 142638, 31018, 4719, 516, 138, 0, 0, 0, 0, 0, 12439}
+      }
+    },
+    { /* block Type 1 */
+      { /* Coeff Band 0 */
+        { 16925, 10553, 852, 16, 63, 87, 47, 0, 0, 0, 0, 31232},
+        { 39777, 26839, 6822, 1908, 678, 456, 227, 168, 35, 0, 0, 46825},
+        { 17300, 16666, 4168, 1209, 492, 154, 118, 207, 0, 0, 0, 19608}
+      },
+      { /* Coeff Band 1 */
+        { 35882, 31722, 4625, 1270, 266, 237, 0, 0, 0, 0, 0, 0},
+        { 15426, 13894, 4482, 1305, 281, 43, 0, 0, 0, 0, 0, 18627},
+        { 3900, 6552, 3472, 1723, 746, 366, 115, 35, 0, 0, 0, 798}
+      },
+      { /* Coeff Band 2 */
+        { 21998, 29132, 3353, 679, 46, 0, 0, 0, 0, 0, 0, 0},
+        { 9098, 15767, 3794, 792, 268, 47, 0, 0, 0, 0, 0, 22402},
+        { 4007, 8472, 2844, 687, 217, 0, 0, 0, 0, 0, 0, 2739}
+      },
+      { /* Coeff Band 3 */
+        { 0, 31414, 2911, 682, 96, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 16515, 4425, 938, 124, 0, 0, 0, 0, 0, 0, 31369},
+        { 0, 4833, 2787, 1213, 150, 0, 0, 0, 0, 0, 0, 3744}
+      },
+      { /* Coeff Band 4 */
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 5 */
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 6 */
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52762},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13326}
+      },
+      { /* Coeff Band 7 */
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+      }
+    },
+    { /* block Type 2 */
+      { /* Coeff Band 0 */
+        { 4444, 1614, 120, 48, 0, 48, 0, 0, 0, 0, 0, 278},
+        { 192436, 103730, 24494, 9845, 4122, 1193, 102, 0, 0, 0, 0, 2577},
+        { 3473446, 2308716, 815510, 370374, 167797, 92152, 12073, 86, 0, 0, 0, 6801}
+      },
+      { /* Coeff Band 1 */
+        { 2150616, 1136388, 250011, 86888, 31434, 13746, 1243, 0, 0, 0, 0, 0},
+        { 1179945, 799802, 266012, 106787, 40809, 16486, 1546, 0, 0, 0, 0, 2673},
+        { 465128, 504130, 286989, 146259, 62380, 30192, 2866, 20, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 2 */
+        { 2157762, 1177519, 282665, 108499, 43389, 23224, 2597, 34, 0, 0, 0, 0},
+        { 1135685, 813705, 278079, 123255, 53935, 29492, 3152, 39, 0, 0, 0, 2978},
+        { 391894, 428037, 264216, 144306, 69326, 40281, 5541, 29, 0, 0, 0, 38}
+      },
+      { /* Coeff Band 3 */
+        { 6669109, 3468471, 782161, 288484, 115500, 51083, 4943, 41, 0, 0, 0, 0},
+        { 3454493, 2361636, 809524, 337663, 141343, 65036, 6361, 0, 0, 0, 0, 8730},
+        { 1231825, 1359522, 824686, 420784, 185517, 98731, 10973, 72, 0, 0, 0, 20}
+      },
+      { /* Coeff Band 4 */
+        { 7606203, 3452846, 659856, 191703, 49335, 14336, 450, 0, 0, 0, 0, 0},
+        { 3806506, 2379332, 691697, 224938, 61966, 18324, 766, 0, 0, 0, 0, 8193},
+        { 1270110, 1283728, 628775, 243378, 72617, 24897, 1087, 0, 0, 0, 0, 0}
+      },
+      { /* Coeff Band 5 */
+        { 15314169, 7436809, 1579928, 515790, 167453, 58305, 3502, 19, 0, 0, 0, 0},
+        { 7021286, 4667922, 1545706, 574463, 191793, 68748, 4048, 1, 0, 0, 0, 17222},
+        { 2011989, 2145878, 1185336, 534879, 195719, 79103, 5343, 4, 0, 0, 0, 37}
+      },
+      { /* Coeff Band 6 */
+        { 63458382, 25384462, 4208045, 1091050, 299011, 95242, 5238, 33, 0, 0, 0, 0},
+        { 25638401, 14694085, 3945978, 1195420, 344813, 117355, 6703, 0, 0, 0, 0, 216811},
+        { 5988177, 5824044, 2754413, 1077350, 370739, 139710, 9693, 38, 0, 0, 0, 1835}
+      },
+      { /* Coeff Band 7 */
+        { 74998348, 29342158, 2955001, 452912, 69631, 9516, 37, 0, 0, 0, 0, 0},
+        { 24762356, 13281085, 2409883, 436787, 68948, 10658, 36, 0, 0, 0, 0, 6614989},
+        { 3882867, 3224489, 1052289, 252890, 46967, 8548, 154, 0, 0, 0, 0, 194354}
+      }
+    },
+    { /* block Type 3 */
+      { /* Coeff Band 0 */
+        { 10583, 12059, 3155, 1041, 248, 175, 24, 2, 0, 0, 0, 5717},
+        { 42461, 41782, 13553, 4966, 1352, 855, 89, 0, 0, 0, 0, 15000},
+        { 4691125, 5045589, 2673566, 1089317, 378161, 160268, 18252, 813, 69, 13, 0, 49}
+      },
+      { /* Coeff Band 1 */
+        { 1535203, 1685686, 924565, 390329, 141709, 60523, 5983, 171, 0, 0, 0, 0},
+        { 1594021, 1793276, 1016078, 441332, 164159, 70843, 8098, 311, 0, 0, 0, 11312},
+        { 1225223, 1430184, 888492, 460713, 203286, 115149, 22061, 804, 7, 0, 0, 0}
+      },
+      { /* Coeff Band 2 */
+        { 1522386, 1590366, 799910, 303691, 96625, 37608, 3637, 180, 33, 11, 0, 0},
+        { 1682184, 1793869, 913649, 353520, 113674, 46309, 4736, 221, 18, 3, 0, 963},
+        { 1574580, 1740474, 954392, 417994, 151400, 67091, 8000, 536, 73, 10, 0, 63}
+      },
+      { /* Coeff Band 3 */
+        { 4963672, 5197790, 2585383, 982161, 313333, 118498, 16014, 536, 62, 0, 0, 0},
+        { 5223913, 5569803, 2845858, 1107384, 364949, 147841, 18296, 658, 11, 11, 0, 1866},
+        { 4042207, 4548894, 2608767, 1154993, 446290, 221295, 41054, 2438, 124, 20, 0, 0}
+      },
+      { /* Coeff Band 4 */
+        { 3857216, 4431325, 2670447, 1330169, 553301, 286825, 46763, 1917, 0, 0, 0, 0},
+        { 4226215, 4963701, 3046198, 1523923, 644670, 355519, 58792, 2525, 0, 0, 0, 1298},
+        { 3831873, 4580350, 3018580, 1660048, 797298, 502983, 123906, 7172, 16, 0, 0, 0}
+      },
+      { /* Coeff Band 5 */
+        { 8524543, 9285149, 4979435, 2039330, 683458, 266032, 22628, 270, 0, 0, 0, 0},
+        { 9432163, 10428088, 5715661, 2385738, 838389, 326264, 29981, 361, 0, 0, 0, 884},
+        { 9039066, 10368964, 6136765, 2862030, 1098269, 511668, 63105, 945, 14, 0, 0, 0}
+      },
+      { /* Coeff Band 6 */
+        { 33222872, 34748297, 17701695, 7214933, 2602336, 1191859, 187873, 12667, 390, 3, 0, 0},
+        { 34765051, 37140719, 19525578, 8268934, 3085012, 1473864, 246743, 15258, 736, 3, 0, 8403},
+        { 28591289, 32252393, 19037068, 9213729, 4020653, 2372354, 586420, 67428, 3920, 92, 7, 3}
+      },
+      { /* Coeff Band 7 */
+        { 68604786, 60777665, 19712887, 5656955, 1520443, 507166, 51829, 2466, 10, 0, 0, 0},
+        { 55447403, 51682540, 19008774, 5928582, 1706884, 595531, 65998, 3661, 101, 0, 0, 8468343},
+        { 28321970, 29149398, 13565882, 5258675, 1868588, 898041, 192023, 21497, 672, 17, 0, 1884921}
+      }
+    }
+  };
+#endif
\ No newline at end of file
diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h
index 7a1e28b..293e742 100644
--- a/vp8/common/defaultcoefcounts.h
+++ b/vp8/common/defaultcoefcounts.h
@@ -18,4 +18,9 @@
                                                  [PREV_COEF_CONTEXTS]
                                                  [MAX_ENTROPY_TOKENS];
 
-#endif //__DEFAULTCOEFCOUNTS_H
+extern const unsigned int vp8_default_coef_counts_8x8[BLOCK_TYPES]
+                                                    [COEF_BANDS]
+                                                    [PREV_COEF_CONTEXTS]
+                                                    [MAX_ENTROPY_TOKENS]; /* NOTE(review): declared unconditionally; the definition in defaultcoefcounts.c is inside #if CONFIG_T8X8 */
+
+#endif /* __DEFAULTCOEFCOUNTS_H */
\ No newline at end of file
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index 5044c25..ca37aab 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -59,6 +59,24 @@
     9, 12, 13, 10,
     7, 11, 14, 15,
 };
+#if CONFIG_T8X8
+DECLARE_ALIGNED(64, cuchar, vp8_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5, /* 64-entry table of coefficient band numbers (values 0..7) for 8x8 blocks; NOTE(review): index is presumably the scan position -- confirm */
+                                                        5, 3, 6, 3, 5, 4, 6, 6,
+                                                        6, 5, 5, 6, 6, 6, 6, 6,
+                                                        6, 6, 6, 6, 6, 6, 6, 6,
+                                                        6, 6, 6, 6, 7, 7, 7, 7,
+                                                        7, 7, 7, 7, 7, 7, 7, 7,
+                                                        7, 7, 7, 7, 7, 7, 7, 7,
+                                                        7, 7, 7, 7, 7, 7, 7, 7
+};
+DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]) = /* 8x8 scan order: a permutation of 0..63; read as row * 8 + col the values trace the classic zig-zag path */
+{
+    0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+};
+#endif
 
 DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
 {
@@ -69,6 +87,9 @@
 };
 
 DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);
+#if CONFIG_T8X8
+DECLARE_ALIGNED(64, short, vp8_default_zig_zag_mask_8x8[64]); /* NOTE(review): the init loop in this file stores 1 << i for i = 0..63; that value does not fit in short, and the shift is undefined once i reaches the width of int -- a 64-bit element type (the original note here read "int64_t") is probably intended */
+#endif
 
 const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
 
@@ -99,8 +120,14 @@
 static const Prob Pcat3[] = { 173, 148, 140};
 static const Prob Pcat4[] = { 176, 155, 140, 135};
 static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
+#if CONFIG_EXTEND_QRANGE
 static const Prob Pcat6[] =
 { 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+#else
+static const Prob Pcat6[] =
+{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+#endif
 
 static vp8_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[22];
 
@@ -112,7 +139,12 @@
     {
         vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
     }
-
+#if CONFIG_T8X8
+    for (i = 0; i < 64; i++)
+    {
+        vp8_default_zig_zag_mask_8x8[vp8_default_zig_zag1d_8x8[i]] = 1 << i;
+    }
+#endif
 }
 
 static void init_bit_tree(vp8_tree_index *p, int n)
@@ -135,7 +167,11 @@
     init_bit_tree(cat3, 3);
     init_bit_tree(cat4, 4);
     init_bit_tree(cat5, 5);
+#if CONFIG_EXTEND_QRANGE
     init_bit_tree(cat6, 13);
+#else
+    init_bit_tree(cat6, 11);
+#endif
 }
 
 vp8_extra_bit_struct vp8_extra_bits[12] =
@@ -150,7 +186,11 @@
     { cat3, Pcat3, 3, 11},
     { cat4, Pcat4, 4, 19},
     { cat5, Pcat5, 5, 35},
+#if CONFIG_EXTEND_QRANGE
     { cat6, Pcat6, 13, 67},
+#else
+    { cat6, Pcat6, 11, 67},
+#endif
     { 0, 0, 0, 0}
 };
 #include "defaultcoefcounts.h"
@@ -183,6 +223,31 @@
         while (++i < COEF_BANDS);
     }
     while (++h < BLOCK_TYPES);
+#if CONFIG_T8X8
+    h = 0;
+    do
+    {
+        int i = 0;
+
+        do
+        {
+            int k = 0;
+
+            do
+            {
+                unsigned int branch_ct [ENTROPY_NODES] [2];
+                vp8_tree_probs_from_distribution(
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    pc->fc.coef_probs_8x8 [h][i][k], branch_ct, vp8_default_coef_counts_8x8 [h][i][k],
+                    256, 1);
+
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++i < COEF_BANDS);
+    }
+    while (++h < BLOCK_TYPES);
+#endif
 }
 
 
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
index 3c25453..d3e841c 100644
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -14,7 +14,7 @@
 
 #include "treecoder.h"
 #include "blockd.h"
-
+#include "common.h"
 /* Coefficient token alphabet */
 
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
@@ -50,8 +50,11 @@
 #define PROB_UPDATE_BASELINE_COST   7
 
 #define MAX_PROB                255
+#if CONFIG_EXTEND_QRANGE
 #define DCT_MAX_VALUE           8192
-
+#else
+#define DCT_MAX_VALUE           2048
+#endif
 
 /* Coefficients are predicted via a 3-dimensional probability table. */
 
@@ -64,6 +67,9 @@
 
 #define COEF_BANDS 8
 extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+#if CONFIG_T8X8
+extern DECLARE_ALIGNED(64, const unsigned char, vp8_coef_bands_8x8[64]);
+#endif
 
 /* Inside dimension is 3-valued measure of nearby complexity, that is,
    the extent to which nearby coefficients are nonzero.  For the first
@@ -87,14 +93,19 @@
 extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
 
 extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
+#if CONFIG_T8X8
+extern const vp8_prob vp8_coef_update_probs_8x8 [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+#endif
 
 struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);
-
 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
 extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
 extern short vp8_default_zig_zag_mask[16];
+#if CONFIG_T8X8
+extern DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]);
+extern short vp8_default_zig_zag_mask_8x8[64];//int64_t
+#endif
 extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
 
 void vp8_coef_tree_initialize(void);
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 47b13c7..1acc015 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -75,7 +75,13 @@
     rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
     rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
     rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_c;
-
+#if CONFIG_T8X8
+    rtcd->idct.idct8        = vp8_short_idct8x8_c;
+    rtcd->idct.idct8_1      = vp8_short_idct8x8_1_c;
+    rtcd->idct.idct1_scalar_add_8x8 = vp8_dc_only_idct_add_8x8_c;
+    rtcd->idct.ihaar2       = vp8_short_ihaar2x2_c;
+    rtcd->idct.ihaar2_1     = vp8_short_ihaar2x2_1_c;
+#endif
     rtcd->recon.copy16x16   = vp8_copy_mem16x16_c;
     rtcd->recon.copy8x8     = vp8_copy_mem8x8_c;
     rtcd->recon.copy8x4     = vp8_copy_mem8x4_c;
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index f5fd94d..d1890b9 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -31,6 +31,34 @@
 #include "arm/idct_arm.h"
 #endif
 
+#if CONFIG_T8X8 /* 8x8 transform entry points: each name below falls back to the plain C routine unless it was already defined before this point */
+#ifndef vp8_idct_idct8
+#define vp8_idct_idct8 vp8_short_idct8x8_c
+#endif
+extern prototype_idct(vp8_idct_idct8);
+
+#ifndef vp8_idct_idct8_1
+#define vp8_idct_idct8_1 vp8_short_idct8x8_1_c
+#endif
+extern prototype_idct(vp8_idct_idct8_1);
+
+#ifndef vp8_idct_ihaar2
+#define vp8_idct_ihaar2 vp8_short_ihaar2x2_c
+#endif
+extern prototype_idct(vp8_idct_ihaar2);
+
+#ifndef vp8_idct_ihaar2_1
+#define vp8_idct_ihaar2_1 vp8_short_ihaar2x2_1_c
+#endif
+extern prototype_idct(vp8_idct_ihaar2_1);
+
+#ifndef vp8_idct_idct1_scalar_add_8x8
+#define vp8_idct_idct1_scalar_add_8x8 vp8_dc_only_idct_add_8x8_c
+#endif
+extern prototype_idct_scalar_add(vp8_idct_idct1_scalar_add_8x8);
+
+#endif /* CONFIG_T8X8 */
+
 #ifndef vp8_idct_idct1
 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
 #endif
@@ -69,6 +97,14 @@
 
     vp8_second_order_fn_t iwalsh1;
     vp8_second_order_fn_t iwalsh16;
+
+#if CONFIG_T8X8
+    vp8_idct_fn_t            idct8;
+    vp8_idct_fn_t            idct8_1;
+    vp8_idct_scalar_add_fn_t idct1_scalar_add_8x8;
+    vp8_idct_fn_t ihaar2;
+    vp8_idct_fn_t ihaar2_1;
+#endif
 } vp8_idct_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index c65d35a..4f3a01b 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -24,9 +24,13 @@
  **************************************************************************/
 #include "vpx_ports/config.h"
 
+
+#include <math.h>
+
 static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 static const int rounding = 0;
+
 void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
 {
     int i;
@@ -222,3 +226,312 @@
         op += 4;
     }
 }
+
+#if CONFIG_T8X8
+
+#define FAST_IDCT_8X8
+
+/* DC-only 8x8 inverse transform: every one of the 64 outputs is set to
+ * (input[0] + 4) >> 3. pitch is the output row stride in bytes and is
+ * halved to step through the short samples. */
+void vp8_short_idct8x8_1_c(short *input, short *output, int pitch)
+{
+    const int stride = pitch >> 1;
+    const int dc = (input[0] + 4) >> 3;
+    short *row = output;
+    int r, c;
+
+    for (r = 0; r < 8; r++)
+    {
+        /* the same rounded DC value fills the whole row */
+        for (c = 0; c < 8; c++)
+        {
+            row[c] = dc;
+        }
+
+        row += stride;
+    }
+}
+
+/* DC-only 8x8 inverse transform fused with reconstruction.
+ * The rounded DC term (input_dc + 4) >> 3 is added to each of the 8x8
+ * predictor samples and the sum, saturated to 0..255, is stored in the
+ * destination.
+ * pitch is the predictor row stride; stride is the destination row stride. */
+void vp8_dc_only_idct_add_8x8_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+{
+    const int dc = (input_dc + 4) >> 3;
+    int row, col;
+
+    /* every row applies the same DC offset */
+    for (row = 0; row < 8; row++)
+    {
+        const unsigned char *pred = pred_ptr + row * pitch;
+        unsigned char *dst = dst_ptr + row * stride;
+
+        for (col = 0; col < 8; col++)
+        {
+            int sum = dc + pred[col];
+
+            /* saturate to the unsigned 8-bit range */
+            if (sum < 0)
+                sum = 0;
+            else if (sum > 255)
+                sum = 255;
+
+            dst[col] = (unsigned char) sum;
+        }
+    }
+}
+
+#ifdef FAST_IDCT_8X8
+
+#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */
+
+/* row (horizontal) IDCT
+ *
+ *   dst[k] = sum(l = 0..7) c[l] * src[l] * cos( (pi / 8) * (k + 1/2) * l )
+ *
+ * where: c[0] = 128, c[1..7] = 128*sqrt(2)
+ * Transforms the 8 ints blk[0..7] in place. */
+static void idctrow (int *blk)
+{
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: only the DC term is nonzero. Scaling is done by multiplying,
+   * not by <<, because left-shifting a negative int is undefined in C. */
+  if (!((x1 = blk[4] * 2048) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
+  {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] = blk[0] * 8;
+    return;
+  }
+  x0 = (blk[0] * 2048) + 128;    /* for proper rounding in the fourth stage */
+
+  /* first stage */
+  x8 = W7 * (x4 + x5);
+  x4 = x8 + (W1 - W7) * x4;
+  x5 = x8 - (W1 + W7) * x5;
+  x8 = W3 * (x6 + x7);
+  x6 = x8 - (W3 - W5) * x6;
+  x7 = x8 - (W3 + W5) * x7;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2);
+  x2 = x1 - (W2 + W6) * x2;
+  x3 = x1 + (W2 - W6) * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *   dst[8*k] = sum(l = 0..7) c[l] * src[8*l] * cos( (pi / 8) * (k + 1/2) * l )
+ *
+ * where: c[0] = 1/1024, c[1..7] = (1/1024)*sqrt(2)
+ * Transforms the 8 ints blk[0], blk[8], ..., blk[56] in place. */
+static void idctcol (int *blk)
+{
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut (DC only); multiply instead of <<: shifting a negative int left is undefined */
+  if (!((x1 = (blk[8 * 4] * 256)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | (x7 = blk[8 * 3])))
+  {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] = blk[8 * 7] =
+      ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+  x0 = (blk[8 * 0] * 256) + 8192;
+
+  /* first stage */
+  x8 = W7 * (x4 + x5) + 4;
+  x4 = (x8 + (W1 - W7) * x4) >> 3;
+  x5 = (x8 - (W1 + W7) * x5) >> 3;
+  x8 = W3 * (x6 + x7) + 4;
+  x6 = (x8 - (W3 - W5) * x6) >> 3;
+  x7 = (x8 - (W3 + W5) * x7) >> 3;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2) + 4;
+  x2 = (x1 - (W2 + W6) * x2) >> 3;
+  x3 = (x1 + (W2 - W6) * x3) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[8 * 0] = (x7 + x1) >> 14;
+  blk[8 * 1] = (x3 + x2) >> 14;
+  blk[8 * 2] = (x0 + x4) >> 14;
+  blk[8 * 3] = (x8 + x6) >> 14;
+  blk[8 * 4] = (x8 - x6) >> 14;
+  blk[8 * 5] = (x0 - x4) >> 14;
+  blk[8 * 6] = (x3 - x2) >> 14;
+  blk[8 * 7] = (x7 - x1) >> 14;
+}
+
+#define TX_DIM 8
+/* Integer 8x8 inverse DCT (the variant built when FAST_IDCT_8X8 is defined).
+ * Reads 64 coefficients from coefs, runs idctrow on each row and idctcol on
+ * each column, and stores the results halved (>> 1) into block, whose row
+ * stride is pitch bytes. */
+void vp8_short_idct8x8_c(short *coefs, short *block, int pitch)
+{
+    int work[TX_DIM * TX_DIM];
+    const int stride = pitch >> 1;
+    int r, c, n;
+
+    /* widen the coefficients to int for the separable passes */
+    for (n = 0; n < TX_DIM * TX_DIM; n++)
+        work[n] = (int)coefs[n];
+
+    for (r = 0; r < 8; r++)
+        idctrow(work + 8 * r);
+
+    for (c = 0; c < 8; c++)
+        idctcol(work + c);
+
+    for (r = 0; r < TX_DIM; r++)
+    {
+        for (c = 0; c < TX_DIM; c++)
+        {
+            block[r * stride + c] = work[r * TX_DIM + c] >> 1;
+        }
+    }
+}
+
+#else
+
+/* Floating-point reference 8x8 inverse DCT, for testing; built only when FAST_IDCT_8X8 is not defined. */
+void vp8_short_idct8x8_c(short *input, short *output, int pitch)
+{
+    int X[8][8];
+    double C[8][8]={{0.0}}, Ct[8][8]={{0.0}}, temp[8][8]={{0.0}};
+    int i,j,k;
+    double temp1=0.0;
+    double pi = atan( 1.0 ) * 4.0;
+    // X: coefficients in / results out; C: 8x8 DCT matrix; Ct: its transpose
+
+    int shortpitch = pitch >> 1;
+
+    for (i = 0; i < 8; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+             X[i][j] = input[i * 8 + j];
+        }
+     }
+
+    // Build C and Ct. TODO: the DCT matrix should be calculated once, not on every call
+    for ( j = 0 ; j < 8 ; j++ ) {
+        C[ 0 ][ j ] = 1.0 / sqrt( (double) 8 );
+        Ct[ j ][ 0 ] = C[ 0 ][ j ];
+    }
+    for ( i = 1 ; i < 8 ; i++ ) {
+        for ( j = 0 ; j < 8 ; j++ ) {
+            C[ i ][ j ] = sqrt( 2.0 / 8 ) *
+                          cos( pi * ( 2 * j + 1 ) * i / ( 2.0 * 8 ) );
+            Ct[ j ][ i ] = C[ i ][ j ];
+        }
+    }
+    /*  temp = X * C  */
+        for ( i = 0 ; i < 8 ; i++ ) {
+            for ( j = 0 ; j < 8 ; j++ ) {
+                temp[ i ][ j ] = 0.0;
+                for ( k = 0 ; k < 8 ; k++ )
+                    temp[ i ][ j ] += X[ i ][ k ] * C[ k ][ j ];
+            }
+        }
+
+    /*  X = floor( (Ct * temp) / 2 + 0.5 )  */
+        for ( i = 0 ; i < 8 ; i++ ) {
+            for ( j = 0 ; j < 8 ; j++ ) {
+                temp1 = 0.0;
+                for ( k = 0 ; k < 8 ; k++ )
+                    temp1 += Ct[ i ][ k ] * temp[ k ][ j ];
+                X[ i ][ j ] = floor( temp1/ 2.0 + 0.5);
+            }
+        }
+
+    for (i = 0; i < 8; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+             output[i*shortpitch+j]  = X[i][j];
+        }
+    }
+}
+#endif
+
+/* Inverse 2x2 Haar transform of input[0], [1], [4] and [8]. All 16 outputs
+ * are cleared first; each sum is biased by 1 away from zero, shifted right
+ * by 2 and stored at output[0], [1], [4] and [8]. pitch is unused. */
+void vp8_short_ihaar2x2_c(short *input, short *output, int pitch)
+{
+   int n, sum;
+
+   for (n = 0; n < 16; n++)
+       output[n] = 0;
+
+   sum = input[0] + input[1] + input[4] + input[8];
+   output[0] = (sum < 0 ? sum - 1 : sum + 1) >> 2;
+   sum = input[0] - input[1] + input[4] - input[8];
+   output[1] = (sum < 0 ? sum - 1 : sum + 1) >> 2;
+   sum = input[0] + input[1] - input[4] - input[8];
+   output[4] = (sum < 0 ? sum - 1 : sum + 1) >> 2;
+   sum = input[0] - input[1] - input[4] + input[8];
+   output[8] = (sum < 0 ? sum - 1 : sum + 1) >> 2;
+}
+
+void vp8_short_ihaar2x2_1_c(short *input, short *output, int pitch) /* single-coefficient 2x2 inverse Haar; pitch is unused */
+{
+   int a1;
+   short *ip = input;
+   short *op = output;
+   a1 = ((ip[0]>=0?ip[0]+1:ip[0]-1) >> 2); /* bias input[0] by 1 away from zero, then shift right by 2 */
+   op[0] = a1;
+   op[2] = a1;
+   op[8] = a1;
+   op[10] = a1;
+   /* NOTE(review): vp8_short_ihaar2x2_c writes output[0], [1], [4], [8]; this writes [0], [2], [8], [10] - confirm which layout callers expect. */
+}
+#endif
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index 81a3f2d..d361b65 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -24,13 +24,24 @@
     }
 
 }
+#if CONFIG_T8X8
+static void recon_dcblock_8x8(MACROBLOCKD *x)
+{
+    BLOCKD *b = &x->block[24]; // block 24 supplies the four values copied below
+    x->block[0].dqcoeff[0] = b->diff[0];
+    x->block[4].dqcoeff[0] = b->diff[1];
+    x->block[8].dqcoeff[0] = b->diff[4];
+    x->block[12].dqcoeff[0] = b->diff[8];
+    // i.e. diff[0], [1], [4], [8] of block 24 become dqcoeff[0] of blocks 0, 4, 8, 12
+}
+#endif
 
 void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
 {
-    if (b->eob > 1)
-        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
-    else
+    if (b->eob <= 1)
         IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+    else
+        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
 }
 
 
@@ -86,3 +97,77 @@
     }
 
 }
+
+#if CONFIG_T8X8
+void vp8_inverse_transform_b_8x8(const vp8_idct_rtcd_vtable_t *rtcd, short *input_dqcoeff, short *output_coeff, int pitch)// 8x8 inverse transform of one coefficient block
+{
+    // Always invokes the full idct8 entry of the rtcd table; unlike
+    // vp8_inverse_transform_b there is no eob-based shortcut (idct8_1) here.
+        IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
+    // input_dqcoeff, output_coeff and pitch are handed to the transform
+    // unchanged.
+
+}
+
+
+void vp8_inverse_transform_mby_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+    int i;
+    // Inverse transform of blocks 0..15 of the macroblock as four 8x8 transforms.
+    // First the 2nd order (ihaar2) transform on block 24, the dc block
+    IDCT_INVOKE(rtcd, ihaar2)(x->block[24].dqcoeff, x->block[24].diff, 8);
+    // then copy its results into dqcoeff[0] of blocks 0, 4, 8 and 12
+    recon_dcblock_8x8(x);
+    for (i = 0; i < 9; i += 8) // i = 0, 8: coefficients and output both at block i
+    {
+        vp8_inverse_transform_b_8x8(rtcd, &x->block[i].dqcoeff[0], &x->block[i].diff[0], 32);
+    }
+    for (i = 2; i < 11; i += 8) // i = 2, 10: coefficients from block i+2 (4, 12), output to diff of block i
+    {
+        vp8_inverse_transform_b_8x8(rtcd, &x->block[i+2].dqcoeff[0], &x->block[i].diff[0], 32);
+    }
+
+}
+void vp8_inverse_transform_mbuv_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+    int i;
+    // two 8x8 inverse transforms, one starting at block 16 and one at block 20, with pitch 16
+    for (i = 16; i < 24; i += 4)
+    {
+        vp8_inverse_transform_b_8x8(rtcd, &x->block[i].dqcoeff[0], &x->block[i].diff[0], 16);
+    }
+
+}
+
+
+void vp8_inverse_transform_mb_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+    int i;
+    // Inverse transform of a whole macroblock (blocks 0..23) using 8x8 transforms.
+    if (x->mode_info_context->mbmi.mode != B_PRED &&
+        x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        // do 2nd order transform on the dc block (block 24); skipped for
+        // B_PRED and SPLITMV macroblocks
+        IDCT_INVOKE(rtcd, ihaar2)(&x->block[24].dqcoeff[0], x->block[24].diff, 8);
+        recon_dcblock_8x8(x); // results go to dqcoeff[0] of blocks 0, 4, 8, 12
+
+    }
+    // blocks 0..15: four 8x8 transforms with pitch 32
+    for (i = 0; i < 9; i += 8) // i = 0, 8
+    {
+        vp8_inverse_transform_b_8x8(rtcd, &x->block[i].dqcoeff[0], &x->block[i].diff[0], 32);
+    }
+    for (i = 2; i < 11; i += 8) // i = 2, 10: coefficients from block i+2 (4, 12), output to diff of block i
+    {
+        vp8_inverse_transform_b_8x8(rtcd, &x->block[i+2].dqcoeff[0], &x->block[i].diff[0], 32);
+    }
+
+    // blocks 16..23: two 8x8 transforms (at blocks 16 and 20) with pitch 16
+    for (i = 16; i < 24; i += 4)
+    {
+        vp8_inverse_transform_b_8x8(rtcd, &x->block[i].dqcoeff[0], &x->block[i].diff[0], 16);
+    }
+
+}
+#endif
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h
index b3ffb70..1466a58 100644
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -20,4 +20,11 @@
 extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 
+#if CONFIG_T8X8
+extern void vp8_inverse_transform_b_8x8(const vp8_idct_rtcd_vtable_t *rtcd, short *input_dqcoeff, short *output_coeff, int pitch);
+extern void vp8_inverse_transform_mb_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+extern void vp8_inverse_transform_mby_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+extern void vp8_inverse_transform_mbuv_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+#endif
+
 #endif
diff --git a/vp8/common/maskingmv.c b/vp8/common/maskingmv.c
new file mode 100644
index 0000000..d01a18f
--- /dev/null
+++ b/vp8/common/maskingmv.c
@@ -0,0 +1,855 @@
+/*
+ ============================================================================
+ Name        : maskingmv.c
+ Author      : jimbankoski
+ Version     :
+ Copyright   : Your copyright notice
+ Description : Color-segmentation macroblock masks and masked motion search
+ ============================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+extern unsigned int vp8_sad16x16_sse3(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int  max_err);
+
+extern void vp8_sad16x16x3_sse3(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int  *results);
+
+extern int vp8_growmaskmb_sse3(
+    unsigned char *om,
+    unsigned char *nm);
+
+extern void vp8_makemask_sse3(
+    unsigned char *y,
+    unsigned char *u,
+    unsigned char *v,
+    unsigned char *ym,
+    int yp,
+    int uvp,
+    int ys,
+    int us,
+    int vs,
+    int yt,
+    int ut,
+    int vt);
+
+unsigned int vp8_sad16x16_unmasked_wmt(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned char *mask);
+
+unsigned int vp8_sad16x16_masked_wmt(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned char *mask);
+
+unsigned int vp8_masked_predictor_wmt(
+    unsigned char *masked,
+    unsigned char *unmasked,
+    int  src_stride,
+    unsigned char *dst_ptr,
+    int  dst_stride,
+    unsigned char *mask);
+unsigned int vp8_masked_predictor_uv_wmt(
+    unsigned char *masked,
+    unsigned char *unmasked,
+    int  src_stride,
+    unsigned char *dst_ptr,
+    int  dst_stride,
+    unsigned char *mask);
+unsigned int vp8_uv_from_y_mask(
+    unsigned char *ymask,
+    unsigned char *uvmask);
+int yp=16;
+unsigned char sxy[]=
+{
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90
+};
+
+unsigned char sts[]=
+{
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+};
+unsigned char str[]=
+{
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+
+unsigned char y[]=
+{
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,
+40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,
+60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,
+60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,
+60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,
+40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,
+40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,
+40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40
+};
+int uvp=8;
+unsigned char u[]=
+{
+90,80,70,70,90,90,90,17,
+90,80,70,70,90,90,90,17,
+84,70,70,90,90,90,17,17,
+84,70,70,90,90,90,17,17,
+80,70,70,90,90,90,17,17,
+90,80,70,70,90,90,90,17,
+90,80,70,70,90,90,90,17,
+90,80,70,70,90,90,90,17
+};
+
+unsigned char v[]=
+{
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80,
+80,80,80,80,80,80,80,80
+};
+
+unsigned char ym[256];
+unsigned char uvm[64];
+typedef struct /* one color segment: a reference (y,u,v) value with a threshold per component */
+{
+    unsigned char y;   /* reference Y value */
+    unsigned char yt;  /* Y threshold: pixel_mask() requires |pixel Y - y| < yt */
+    unsigned char u;   /* reference U value */
+    unsigned char ut;  /* U threshold: pixel_mask() requires |pixel U - u| < ut */
+    unsigned char v;   /* reference V value */
+    unsigned char vt;  /* V threshold: pixel_mask() requires |pixel V - v| < vt */
+    unsigned char use; /* presumably an enable flag; not read by pixel_mask() - TODO confirm */
+} COLOR_SEG_ELEMENT;
+
+/*
+COLOR_SEG_ELEMENT segmentation[]=
+{
+    { 60,4,80,17,80,10, 1},
+    { 40,4,15,10,80,10, 1},
+};
+*/
+
+COLOR_SEG_ELEMENT segmentation[]=
+{
+    { 79,44,92,44, 237,60, 1},
+};
+
+/* Returns 255 when (y,u,v) lies strictly within the thresholds of at least
+ * one of the first c entries of sgm, and 0 otherwise. */
+unsigned char pixel_mask(unsigned char y,unsigned char u,unsigned char v,
+                COLOR_SEG_ELEMENT sgm[], int c)
+{
+    int n;
+    for(n=0;n<c;n++)
+    {
+        const COLOR_SEG_ELEMENT *seg=&sgm[n];
+        if(abs(y-seg->y)<seg->yt && abs(u-seg->u)<seg->ut && abs(v-seg->v)<seg->vt)
+            return 255;
+    }
+    return 0;
+}
+/* neighbors[i]: the up-to-8 pixels adjacent to pixel i of a 16x16 block
+ * (row i>>4, column i&15); unused slots hold i itself. */
+int neighbors[256][8];
+/* Fills neighbors[]; returns 0. Pixel i itself is skipped: counting it gave
+ * interior pixels 9 entries, overrunning the 8-slot row. */
+int makeneighbors(void)
+{
+    int i,j;
+    for(i=0;i<256;i++)
+    {
+        int r=(i>>4),c=(i&15),ni=0;
+        for(j=0;j<8;j++)
+            neighbors[i][j]=i;
+        for(j=0;j<256;j++)
+            if(j!=i&&abs((j>>4)-r)<2&&abs((j&15)-c)<2)
+                neighbors[i][ni++]=j;
+    }
+    return 0;
+}
+/* Dilates the 256-entry mask in place: each entry becomes the OR of itself
+ * and of the entries listed for it in the neighbors table. */
+void grow_ymask(unsigned char *ym)
+{
+    unsigned char grown[256];
+    int p,k;
+
+    for(p=0;p<256;p++)
+    {
+        unsigned char acc=ym[p];
+        for(k=0;k<8;k++)
+            acc|=ym[neighbors[p][k]];
+        grown[p]=acc;
+    }
+    memcpy(ym,grown,sizeof grown);
+}
+/* Builds the 16x16 Y mask ym and the 8x8 UV mask uvm of one macroblock with
+ * pixel_mask(), then dilates ym with grow_ymask(). */
+void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
+                  unsigned char *ym, unsigned char *uvm,
+                  int yp, int uvp, COLOR_SEG_ELEMENT sgm[], int count)
+{
+    int row,col;
+
+    memset(ym,20,256);
+    for(row=0;row<8;row++)
+    {
+        const unsigned char *y0=y+2*row*yp,*y1=y0+yp;
+        const unsigned char *ur=u+row*uvp,*vr=v+row*uvp;
+        unsigned char *m=ym+32*row,*cm=uvm+8*row;
+        for(col=0;col<8;col++)
+        {
+            const int x=col<<1;
+            m[x]=cm[col]=pixel_mask(y0[x],ur[col],vr[col],sgm,count);
+            m[x+1]=pixel_mask(y0[x+1],ur[col],vr[col],sgm,count);
+            m[x+16]=pixel_mask(y1[x],ur[col],vr[col],sgm,count);
+            m[x+17]=pixel_mask(y1[x+1],ur[col],vr[col],sgm,count);
+        }
+    }
+    grow_ymask(ym);
+}
+
+/* Sum of absolute differences between the 16x16 blocks at src (stride p) and
+ * dst (stride dp), counting only pixels whose 16x16 mask entry is nonzero. */
+int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+               unsigned char *ym )
+{
+    unsigned total = 0;
+    int n;
+    for(n=0;n<256;n++)
+        if(ym[n])
+            total += abs(src[(n>>4)*p+(n&15)]-dst[(n>>4)*dp+(n&15)]);
+    return total;
+}
+
+/* Number of positions at which the two 256-entry masks differ. */
+int compare_masks(unsigned char *sym, unsigned char *ym)
+{
+    unsigned diff = 0;
+    int n;
+    for(n=0;n<256;n++)
+        if(sym[n]!=ym[n])
+            diff++;
+    return diff;
+}
+/* Sum of absolute differences between the 16x16 blocks at src (stride p) and
+ * dst (stride dp), counting only pixels whose 16x16 mask entry is zero. */
+int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+               unsigned char *ym)
+{
+    unsigned total = 0;
+    int n;
+    for(n=0;n<256;n++)
+        if(!ym[n])
+            total += abs(src[(n>>4)*p+(n&15)]-dst[(n>>4)*dp+(n&15)]);
+    return total;
+}
+int masked_motion_search( unsigned char *y, unsigned char *u, unsigned char *v, // block being matched (16x16 Y, 8x8 U and V)
+                          int yp, int uvp,                                      // its Y and UV strides
+                          unsigned char *dy, unsigned char *du, unsigned char *dv, // planes searched at offsets -32..31
+                          int dyp, int duvp,                                    // their Y and UV strides
+                          COLOR_SEG_ELEMENT sgm[],                              // color segments that define the mask
+                          int count,                                            // number of sgm entries used
+                          int *mi,                                              // out: row offset of the masked vector
+                          int *mj,                                              // out: column offset of the masked vector
+                          int *ui,                                              // out: row offset of the unmasked vector
+                          int *uj,                                              // out: column offset of the unmasked vector
+                          int *wm)                                              // out: 1 if the second strategy won, else 0
+{
+    int i,j;
+    // Exhaustive search for two vectors into dy: one scored on mask-selected pixels, one on the rest.
+    unsigned char ym[256];
+    unsigned char uvm[64];
+    unsigned char dym[256];
+    unsigned char duvm[64];
+    unsigned int e = 0 ;
+    int beste=256;
+    int bmi=-32,bmj=-32;
+    int bui=-32,buj=-32;
+    int beste1=256;
+    int bmi1=-32,bmj1=-32;
+    int bui1=-32,buj1=-32;
+    int obeste;
+    // Two strategies are tried and the one with the smaller combined error is reported. Always returns 0.
+    // Strategy 1: best unmasked vector first, then the best masked vector for the mask at that position.
+    beste = 0xffffffff;
+
+    // find best unmasked mv
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        unsigned char *duz = i/2*duvp + du;
+        unsigned char *dvz = i/2*duvp + dv;
+        for(j=-32;j<32;j++)
+        {
+            // mask computed from the candidate block at offset (i,j)
+            make_mb_mask(dyz+j,duz+j/2, dvz+j/2, dym, duvm, dyp, duvp,sgm,count);
+
+            e = unmasked_sad(y, yp, dyz+j, dyp, dym );
+
+            if(e<beste)
+            {
+                bui=i;
+                buj=j;
+                beste=e;
+            }
+        }
+    }
+    // rebuild the mask at the best unmasked offset;
+    // it stays fixed during the masked search below
+    make_mb_mask(dy+bui*dyp+buj,du+bui/2*duvp+buj/2, dv+bui/2*duvp+buj/2,
+                 dym, duvm, dyp, duvp,sgm,count);
+
+    obeste = beste;
+    beste = 0xffffffff;
+
+    // find best masked
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        for(j=-32;j<32;j++)
+        {
+            e = masked_sad(y, yp, dyz+j, dyp, dym );
+
+            if(e<beste)
+            {
+                bmi=i;
+                bmj=j;
+                beste=e;
+            }
+        }
+    }
+    beste1=beste+obeste;
+    bmi1=bmi;bmj1=bmj;
+    bui1=bui;buj1=buj;
+    // Strategy 2: match the source mask first, then find the best unmasked vector for that mask.
+    beste = 0xffffffff;
+    // source mask
+    make_mb_mask(y,u, v, ym, uvm, yp, uvp,sgm,count);
+
+    // find the offset whose mask differs least from the source mask
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        unsigned char *duz = i/2*duvp + du;
+        unsigned char *dvz = i/2*duvp + dv;
+        for(j=-32;j<32;j++)
+        {
+            // mask computed from the candidate block at offset (i,j)
+            make_mb_mask(dyz+j,duz+j/2, dvz+j/2, dym, duvm, dyp, duvp,sgm,count);
+
+            e = compare_masks(ym, dym);
+
+            if(e<beste)
+            {
+                bmi=i;
+                bmj=j;
+                beste=e;
+            }
+        }
+    }
+
+
+    // rebuild the mask at the best mask-match offset
+    make_mb_mask(dy+bmi*dyp+bmj,du+bmi/2*duvp+bmj/2, dv+bmi/2*duvp+bmj/2,
+                 dym, duvm, dyp, duvp,sgm,count);
+
+    obeste = masked_sad(y, yp, dy+bmi*dyp+bmj, dyp, dym );
+
+    beste = 0xffffffff;
+
+    // find best unmasked mv
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        for(j=-32;j<32;j++)
+        {
+            e = unmasked_sad(y, yp, dyz+j, dyp, dym );
+
+            if(e<beste)
+            {
+                bui=i;
+                buj=j;
+                beste=e;
+            }
+        }
+    }
+    beste += obeste;
+
+    // report whichever strategy has the smaller combined (masked + unmasked) error
+    if(beste<beste1)
+    {
+        *mi = bmi;
+        *mj = bmj;
+        *ui = bui;
+        *uj = buj;
+        *wm = 1;
+    }
+    else
+    {
+        *mi = bmi1;
+        *mj = bmj1;
+        *ui = bui1;
+        *uj = buj1;
+        *wm = 0;
+
+    }
+    return 0;
+}
+
+/* prd (16x16, stride 16) takes src where the mask ym is nonzero, else dst; returns 0. */
+int predict(unsigned char *src, int p, unsigned char *dst, int dp,
+            unsigned char *ym, unsigned char *prd )
+{
+    int n;
+    for(n=0;n<256;n++)
+        prd[n]=ym[n] ? src[(n>>4)*p+(n&15)] : dst[(n>>4)*dp+(n&15)];
+    return 0;
+}
+
+/* Variant of masked_motion_search() built on the vp8_*_sse3 / vp8_*_wmt
+ * helpers. Searches offsets -32..31 (rows i, columns j) into dy for two
+ * vectors: (*mi,*mj), scored on the mask-selected pixels, and (*ui,*uj),
+ * scored on the remaining pixels. Masks are built by vp8_makemask_sse3()
+ * from sgm[0] only (count is not used) and grown by vp8_growmaskmb_sse3().
+ *
+ * Two strategies are tried:
+ *   1. find the best unmasked vector, then the best masked vector for the
+ *      mask found at that position;
+ *   2. find the position whose mask best matches the source mask, then the
+ *      best unmasked vector for that mask.
+ * The strategy with the smaller combined error wins: *wm is 1 when strategy
+ * 2 was chosen and 0 otherwise, and the winning combined error is returned.
+ *
+ * The caller must keep dy/du/dv readable over the whole search window.
+ */
+int fast_masked_motion_search( unsigned char *y, unsigned char *u, unsigned char *v,
+                          int yp, int uvp,
+                          unsigned char *dy, unsigned char *du, unsigned char *dv,
+                          int dyp, int duvp,
+                          COLOR_SEG_ELEMENT sgm[],
+                          int count,
+                          int *mi,
+                          int *mj,
+                          int *ui,
+                          int *uj,
+                          int *wm)
+{
+    int i,j;
+
+    unsigned char ym[256];
+    unsigned char ym2[256];
+    unsigned char dym2[256];
+    unsigned char dym[256];
+    unsigned int e = 0 ;
+    int beste=256;
+    int bmi=-32,bmj=-32;
+    int bui=-32,buj=-32;
+    int beste1=256;
+    int bmi1=-32,bmj1=-32;
+    int bui1=-32,buj1=-32;
+    int obeste;
+
+    // strategy 1: best unmasked vector first, then the best masked vector
+    beste = 0xffffffff;
+
+    // find best unmasked mv
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        unsigned char *duz = i/2*duvp + du;
+        unsigned char *dvz = i/2*duvp + dv;
+        for(j=-32;j<32;j++)
+        {
+            // mask computed from the candidate block at offset (i,j)
+            vp8_makemask_sse3(dyz+j,duz+j/2, dvz+j/2, dym, dyp, duvp,
+                              sgm[0].y,sgm[0].u,sgm[0].v,
+                              sgm[0].yt,sgm[0].ut,sgm[0].vt);
+
+            vp8_growmaskmb_sse3(dym,dym2);
+
+            e = vp8_sad16x16_unmasked_wmt(y, yp, dyz+j, dyp, dym2 );
+
+            if(e<beste)
+            {
+                bui=i;
+                buj=j;
+                beste=e;
+            }
+        }
+    }
+    // rebuild the mask at the best unmasked offset
+
+    vp8_makemask_sse3(dy+bui*dyp+buj,du+bui/2*duvp+buj/2, dv+bui/2*duvp+buj/2,
+                      dym, dyp, duvp,
+                      sgm[0].y,sgm[0].u,sgm[0].v,
+                      sgm[0].yt,sgm[0].ut,sgm[0].vt);
+
+    vp8_growmaskmb_sse3(dym,dym2);
+
+    obeste = beste;
+    beste = 0xffffffff;
+
+    // find best masked
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        for(j=-32;j<32;j++)
+        {
+            e = vp8_sad16x16_masked_wmt(y, yp, dyz+j, dyp, dym2 );
+            if(e<beste)
+            {
+                bmi=i;
+                bmj=j;
+                beste=e;
+            }
+        }
+    }
+    beste1=beste+obeste;
+    bmi1=bmi;bmj1=bmj;
+    bui1=bui;buj1=buj;
+
+    // strategy 2. Reset the running best first: without this the mask-mismatch
+    // counts below were compared against the stale masked SAD from strategy 1.
+    beste = 0xffffffff;
+
+    // source mask
+    vp8_makemask_sse3(y,u, v,
+                        ym, yp, uvp,
+                        sgm[0].y,sgm[0].u,sgm[0].v,
+                        sgm[0].yt,sgm[0].ut,sgm[0].vt);
+
+    vp8_growmaskmb_sse3(ym,ym2);
+
+    // find best mask
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        unsigned char *duz = i/2*duvp + du;
+        unsigned char *dvz = i/2*duvp + dv;
+        for(j=-32;j<32;j++)
+        {
+            // mask computed from the candidate block at offset (i,j)
+            vp8_makemask_sse3(dyz+j,duz+j/2, dvz+j/2, dym, dyp, duvp,
+                              sgm[0].y,sgm[0].u,sgm[0].v,
+                              sgm[0].yt,sgm[0].ut,sgm[0].vt);
+
+            vp8_growmaskmb_sse3(dym,dym2);
+
+            e = compare_masks(ym2, dym2);
+
+            if(e<beste)
+            {
+                bmi=i;
+                bmj=j;
+                beste=e;
+            }
+        }
+    }
+
+    // rebuild the mask at the best mask-match offset
+    vp8_makemask_sse3(dy+bmi*dyp+bmj,du+bmi/2*duvp+bmj/2, dv+bmi/2*duvp+bmj/2,
+                      dym, dyp, duvp,
+                      sgm[0].y,sgm[0].u,sgm[0].v,
+                      sgm[0].yt,sgm[0].ut,sgm[0].vt);
+
+    vp8_growmaskmb_sse3(dym,dym2);
+
+    obeste = vp8_sad16x16_masked_wmt(y, yp, dy+bmi*dyp+bmj, dyp, dym2 );
+
+    beste = 0xffffffff;
+
+    // find best unmasked mv
+    for(i=-32;i<32;i++)
+    {
+        unsigned char *dyz = i*dyp + dy;
+        for(j=-32;j<32;j++)
+        {
+            e = vp8_sad16x16_unmasked_wmt(y, yp, dyz+j, dyp, dym2 );
+
+            if(e<beste)
+            {
+                bui=i;
+                buj=j;
+                beste=e;
+            }
+        }
+    }
+    beste += obeste;
+
+    // keep whichever strategy has the smaller combined error
+    if(beste<beste1)
+    {
+        *mi = bmi;
+        *mj = bmj;
+        *ui = bui;
+        *uj = buj;
+        *wm = 1;
+    }
+    else
+    {
+        *mi = bmi1;
+        *mj = bmj1;
+        *ui = bui1;
+        *uj = buj1;
+        *wm = 0;
+        beste=beste1;
+
+    }
+    return beste;
+}
+
+int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, // Y, U, V planes the vectors are applied to
+                int ymp, int uvmp,                                       // their Y and UV strides
+                unsigned char *yp, unsigned char *up, unsigned char *vp, // passed to the predictor helpers as their destination
+                int ypp, int uvpp,                                       // destination Y and UV strides
+                COLOR_SEG_ELEMENT sgm[],                                 // only sgm[0] is used
+                int count,                                               // not used
+                int mi,                                                  // masked vector, row offset
+                int mj,                                                  // masked vector, column offset
+                int ui,                                                  // unmasked vector, row offset
+                int uj,                                                  // unmasked vector, column offset
+                int wm)                                                  // nonzero: build the mask at the masked vector
+{
+    int i,j;
+    unsigned char dym[256];
+    unsigned char dym2[256];
+    unsigned char duvm[64];
+    unsigned char *yu=ym,*uu=um, *vu=vm;
+    // dym3 aliases dym2, the grown mask handed to the predictor helpers
+    unsigned char *dym3=dym2;
+    // ym/um/vm: block displaced by the masked vector (mi,mj)
+    ym+=mi*ymp+mj;
+    um+=mi/2*uvmp+mj/2;
+    vm+=mi/2*uvmp+mj/2;
+    // yu/uu/vu: block displaced by the unmasked vector (ui,uj)
+    yu+=ui*ymp+uj;
+    uu+=ui/2*uvmp+uj/2;
+    vu+=ui/2*uvmp+uj/2;
+
+    // build the mask from the masked-vector block when wm is set, otherwise from the unmasked-vector block
+    if(wm)
+        vp8_makemask_sse3(ym,um, vm, dym, ymp, uvmp,
+                              sgm[0].y,sgm[0].u,sgm[0].v,
+                              sgm[0].yt,sgm[0].ut,sgm[0].vt);
+    else
+        vp8_makemask_sse3(yu,uu, vu, dym, ymp, uvmp,
+                              sgm[0].y,sgm[0].u,sgm[0].v,
+                              sgm[0].yt,sgm[0].ut,sgm[0].vt);
+    // grow the mask, run the Y predictor, derive the UV mask, run the U and V predictors (helpers are defined elsewhere - TODO confirm their semantics)
+    vp8_growmaskmb_sse3(dym,dym2);
+    vp8_masked_predictor_wmt(ym,yu,ymp,yp,ypp,dym3);
+    vp8_uv_from_y_mask(dym3,duvm);
+    vp8_masked_predictor_uv_wmt(um,uu,uvmp,up,uvpp,duvm);
+    vp8_masked_predictor_uv_wmt(vm,vu,uvmp,vp,uvpp,duvm);
+    // always returns 0
+    return 0;
+}
+
+unsigned char f0p[1280*720*3/2];
+unsigned char f1p[1280*720*3/2];
+unsigned char prd[1280*720*3/2];
+unsigned char msk[1280*720*3/2];
+
+
+int mainz(int argc, char *argv[]) {
+    // Stand-alone test driver: reads raw planar 4:2:0 frames (w x h) from
+    // argv[1], predicts every interior 16x16 macroblock of each frame from
+    // the frame before it and writes the predictions to argv[2].
+    // Arguments: <input> <output> <width> <height>. Returns 0 on success,
+    // -1 on bad arguments, unopenable files, short input or a failed write.
+    FILE *f,*g;
+    int w,h;
+    int y_stride,uv_stride;
+    int r,c;
+    int status=0;
+    size_t frame_size;
+    unsigned char *f0=f0p,*f1=f1p,*t;
+
+    COLOR_SEG_ELEMENT last={ 20,20,20,20, 230,20, 1};
+    // seeded so predict_all() never reads an unset segment before a search ran
+    COLOR_SEG_ELEMENT best=last;
+
+    if(argc<5)
+        return -1;
+
+    w=atoi(argv[3]);
+    h=atoi(argv[4]);
+    // one frame (w*h*3/2 bytes) has to fit the static f0p/f1p/prd buffers
+    if(w<=0 || h<=0 || (size_t)w>sizeof(f0p)*2/3/(size_t)h)
+        return -1;
+    y_stride=w;
+    uv_stride=w/2;
+    frame_size=(size_t)w*h*3/2;
+
+    f=fopen(argv[1],"rb");
+    if(!f)
+        return -1;
+    g=fopen(argv[2],"wb");
+    if(!g)
+    {
+        fclose(f);
+        return -1;
+    }
+
+    makeneighbors();
+    memset(prd,128,frame_size);
+
+    // f0 holds the previous frame; fail if not even one frame is present
+    if(fread(f0,frame_size,1,f)!=1)
+        status=-1;
+
+    // stop at the first short read instead of predicting from stale data
+    while(status==0 && fread(f1,frame_size,1,f)==1)
+    {
+        unsigned char *ys=f1,*yd=f0,*yp=prd;
+        unsigned char *us=f1+w*h,*ud=f0+w*h,*up=prd+w*h;
+        unsigned char *vs=f1+w*h*5/4,*vd=f0+w*h*5/4,*vp=prd+w*h*5/4;
+
+        // leave a 32 pixel border (16 chroma rows) untouched on each side
+        ys+=32*y_stride;yd+=32*y_stride;yp+=32*y_stride;
+        us+=16*uv_stride;ud+=16*uv_stride;up+=16*uv_stride;
+        vs+=16*uv_stride;vd+=16*uv_stride;vp+=16*uv_stride;
+        for(r=32;r<h-32;r+=16,
+            ys+=16*w,yd+=16*w,yp+=16*w,
+            us+=8*uv_stride,ud+=8*uv_stride,up+=8*uv_stride,
+            vs+=8*uv_stride,vd+=8*uv_stride,vp+=8*uv_stride)
+        {
+            for(c=32;c<w-32;c+=16)
+            {
+                int mi,mj,ui,uj,wm;
+                // zero motion and wm=0 unless the search below does better
+                int bmi=0,bmj=0,bui=0,buj=0,bwm=0;
+
+                // search only when the co-located block is not identical
+                if(vp8_sad16x16_sse3( ys+c,y_stride, yd+c,y_stride,0xffff) != 0)
+                {
+                    COLOR_SEG_ELEMENT cs[5];
+                    int j;
+                    unsigned int beste=0xfffffff;
+                    unsigned int bestj=0;
+
+                    // try color from last mb segmentation
+                    cs[0] = last;
+
+                    // try color segs from 4 pixels in mb recon as segmentation
+                    cs[1].y = yd[c + y_stride + 1];
+                    cs[1].u = ud[c/2 + uv_stride];
+                    cs[1].v = vd[c/2 + uv_stride];
+                    cs[1].yt = cs[1].ut = cs[1].vt = 20;
+                    cs[2].y = yd[c + w + 14];
+                    cs[2].u = ud[c/2 + uv_stride+7];
+                    cs[2].v = vd[c/2 + uv_stride+7];
+                    cs[2].yt = cs[2].ut = cs[2].vt = 20;
+                    cs[3].y = yd[c + w*14 + 1];
+                    cs[3].u = ud[c/2 + uv_stride*7];
+                    cs[3].v = vd[c/2 + uv_stride*7];
+                    cs[3].yt = cs[3].ut = cs[3].vt = 20;
+                    cs[4].y = yd[c + w*14 + 14];
+                    cs[4].u = ud[c/2 + uv_stride*7+7];
+                    cs[4].v = vd[c/2 + uv_stride*7+7];
+                    cs[4].yt = cs[4].ut = cs[4].vt = 20;
+
+                    for(j=0;j<5;j++)
+                    {
+                        int e = fast_masked_motion_search(
+                           ys+c, us+c/2, vs+c/2, y_stride, uv_stride,
+                           yd+c, ud+c/2, vd+c/2, y_stride, uv_stride,
+                           &cs[j], 1, &mi,&mj,&ui,&uj,&wm);
+
+                        if((unsigned int)e<beste)
+                        {
+                            bmi=mi;bmj=mj;bui=ui;buj=uj;bwm=wm;
+                            bestj=j;
+                            beste=e;
+                        }
+                    }
+                    best = cs[bestj];
+                    last = best;
+                }
+                predict_all(yd+c, ud+c/2, vd+c/2, w, uv_stride,
+                            yp+c, up+c/2, vp+c/2, w, uv_stride,
+                            &best, 1, bmi,bmj,bui,buj,bwm);
+            }
+        }
+        if(fwrite(prd,frame_size,1,g)!=1)
+            status=-1;
+        t=f0;
+        f0=f1;
+        f1=t;
+    }
+    fclose(f);
+    if(fclose(g)!=0)
+        status=-1;
+    return status;
+}
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 11a830f..f84bd6e 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -46,6 +46,9 @@
     vp8_prob uv_mode_prob [VP8_UV_MODES-1];
     vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
     vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+#if CONFIG_T8X8
+    vp8_prob coef_probs_8x8 [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+#endif
     MV_CONTEXT mvc[2];
     MV_CONTEXT pre_mvc[2];  /* not to caculate the mvcost for the frame if mvc doesn't change. */
 } FRAME_CONTEXT;
diff --git a/vp8/common/x86/mask_sse3.asm b/vp8/common/x86/mask_sse3.asm
new file mode 100644
index 0000000..0d90cfa
--- /dev/null
+++ b/vp8/common/x86/mask_sse3.asm
@@ -0,0 +1,484 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Writes a 16x16 byte mask to ym: 0xff where y, u and v are all within
+; their tolerance (yt,ut,vt) of the target colour (ys,us,vs), else 0.
+; Each chroma sample covers a 2x2 block of luma samples.
+; The tolerance test is a signed byte compare (pcmpgtb).
+; ym is stored with movdqa and therefore has to be 16-byte aligned.
+;void vp8_makemask_sse3(
+;    unsigned char *y,
+;    unsigned char *u,
+;    unsigned char *v,
+;    unsigned char *ym,
+;    int yp,
+;    int uvp,
+;    int ys, int us, int vs,
+;    int yt, int ut, int vt)
+global sym(vp8_makemask_sse3)
+sym(vp8_makemask_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 12
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;y
+        mov             rdi,        arg(1) ;u
+        mov             rcx,        arg(2) ;v
+        mov             rax,        arg(3) ;ym
+        movsxd          r9,         dword arg(4) ;yp (r9 is caller-saved, rbx is not)
+        movsxd          rdx,        dword arg(5) ;uvp
+
+        pxor            xmm0,xmm0       ;all-zero shuffle control: broadcast byte 0
+
+        ;make 16 copies of the center y value
+        movd            xmm1, arg(6)
+        pshufb          xmm1, xmm0
+
+        ; make 16 copies of the center u value
+        movd            xmm2, arg(7)
+        pshufb          xmm2, xmm0
+
+        ; make 16 copies of the center v value
+        movd            xmm3, arg(8)
+        pshufb          xmm3, xmm0
+        unpcklpd        xmm2, xmm3      ;xmm2 = u target (low half), v target (high half)
+
+        ;make 16 copies of the y tolerance
+        movd            xmm3, arg(9)
+        pshufb          xmm3, xmm0
+
+        ;make 16 copies of the u tolerance
+        movd            xmm4, arg(10)
+        pshufb          xmm4, xmm0
+
+        ;make 16 copies of the v tolerance
+        movd            xmm5, arg(11)
+        pshufb          xmm5, xmm0
+        unpckhpd        xmm4, xmm5      ;xmm4 = u tolerance (low half), v tolerance (high half)
+
+        mov             r8,8            ;8 iterations: two y rows and one u/v row each
+
+NextPairOfRows:
+
+        ;grab the y source values
+        movdqu          xmm0, [rsi]
+
+        ;compute abs difference between source and y target
+        movdqa          xmm6, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm6, xmm7
+        por             xmm0, xmm6
+
+        ;xmm6 = 0xff where y tolerance > abs difference (row 0)
+        movdqa          xmm6, xmm3
+        pcmpgtb         xmm6, xmm0
+
+        ;grab the y source values of the next row
+        add             rsi, r9
+        movdqu          xmm0, [rsi]
+
+        ;compute abs difference between source and y target
+        movdqa          xmm11, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm11, xmm7
+        por             xmm0, xmm11
+
+        ;xmm11 = 0xff where y tolerance > abs difference (row 1)
+        movdqa          xmm11, xmm3
+        pcmpgtb         xmm11, xmm0
+
+
+        ;grab the u and v source values
+        movdqu          xmm7, [rdi]
+        movdqu          xmm8, [rcx]
+        unpcklpd        xmm7, xmm8      ;8 u bytes in the low half, 8 v bytes in the high half
+
+        ;compute abs difference between source and uv targets
+        movdqa          xmm9, xmm2
+        movdqa          xmm10, xmm7
+        psubusb         xmm7, xmm2
+        psubusb         xmm9, xmm10
+        por             xmm7, xmm9
+
+        ;check whether the number is < tolerance
+        movdqa          xmm0, xmm4
+        pcmpgtb         xmm0, xmm7
+
+        ;double  u and v masks
+        movdqa          xmm8, xmm0
+        punpckhbw       xmm0, xmm0      ;v mask, every byte repeated twice
+        punpcklbw       xmm8, xmm8      ;u mask, every byte repeated twice
+
+        ;mask row 0 and output
+        pand            xmm6, xmm8
+        pand            xmm6, xmm0
+        movdqa          [rax],xmm6
+
+        ;mask row 1 and output
+        pand            xmm11, xmm8
+        pand            xmm11, xmm0
+        movdqa          [rax+16],xmm11
+
+
+        ; to the next row or set of rows
+        add             rsi, r9
+        add             rdi, rdx
+        add             rcx, rdx
+        add             rax,32
+        dec r8
+        jnz NextPairOfRows
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;GROW_HORIZ (register for result, source register or 16-byte aligned memory)
+; result = source | (source shifted one byte left) | (source shifted one
+; byte right), i.e. every set byte spreads to both neighbours in the row
+%macro GROW_HORIZ 2
+    movdqa          %1, %2              ; aligned access when %2 is memory
+    movdqa          xmm14, %1           ; xmm14 and xmm15 are used as scratch
+    movdqa          xmm15, %1
+    pslldq          xmm14, 1
+    psrldq          xmm15, 1
+    por             %1,xmm14
+    por             %1,xmm15
+%endmacro
+;GROW_VERT (result, center row, above row, below row): result = OR of the three rows
+%macro GROW_VERT 4
+    movdqa          %1,%2
+    por             %1,%3
+    por             %1,%4
+%endmacro
+; xmm0, xmm1 and xmm2 are hard-wired below as the three rows that get combined
+;GROW_NEXTLINE (new line to grow, new source, line to write)
+%macro GROW_NEXTLINE 3
+    GROW_HORIZ %1, %2                   ; %1 must be one of xmm0/xmm1/xmm2
+    GROW_VERT xmm3, xmm0, xmm1, xmm2    ; order is irrelevant, it is a plain OR
+    movdqa %3,xmm3                      ; aligned store of the finished row
+%endmacro
+
+
+; Grows the 16x16 byte mask om by one pixel in every direction into nm.
+; Both buffers are accessed with movdqa and so must be 16-byte aligned.
+;void vp8_growmaskmb_sse3(unsigned char *om, unsigned char *nm)
+global sym(vp8_growmaskmb_sse3)
+sym(vp8_growmaskmb_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;src
+    mov             rdi,        arg(1) ;rst
+    ; horizontally grow source rows 0, 1 and 2
+    GROW_HORIZ xmm0, [rsi]
+    GROW_HORIZ xmm1, [rsi+16]
+    GROW_HORIZ xmm2, [rsi+32]
+    ; output row 0 has no row above it: rows 0|1; output row 1: rows 0|1|2
+    GROW_VERT xmm3, xmm0, xmm1, xmm2
+    por xmm0,xmm1
+    movdqa [rdi], xmm0
+    movdqa [rdi+16],xmm3
+    ; each step loads source row n+1 over the oldest register and writes output row n
+    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
+    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
+    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
+    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
+    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
+    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
+    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
+    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
+    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
+    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
+    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
+    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
+    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
+    ; output row 15 has no row below it: rows 15 (xmm0) | 14 (xmm2)
+    por xmm0,xmm2
+    movdqa [rdi+240], xmm0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+; Sum of absolute differences over a 16x16 block. Every src and ref row is
+; ANDed with the matching 16 mask bytes first, so bytes under a zero mask
+; contribute nothing. mask holds 16 rows of 16 bytes.
+;unsigned int vp8_sad16x16_masked_wmt(
+;    unsigned char *src_ptr, int src_stride,
+;    unsigned char *ref_ptr, int ref_stride, unsigned char *mask)
+global sym(vp8_sad16x16_masked_wmt)
+sym(vp8_sad16x16_masked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+
+    mov             r8,         arg(4) ;mask (r8 is caller-saved, rbx is not)
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3    ;running sum, one per 8-byte half
+
+NextSadRow:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [r8]
+    pand            xmm0,       xmm2
+    pand            xmm1,       xmm2
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi, rax
+    add             rdi, rdx
+    add             r8,   16
+
+    dec rcx
+    jnz NextSadRow
+
+    movdqa          xmm4 ,     xmm3     ;add the high half sum to the low one
+    psrldq          xmm4,       8
+    paddw           xmm3,      xmm4
+    movq            rax,       xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+; Sum of absolute differences over a 16x16 block. Every src and ref row is
+; ORed with the matching 16 mask bytes first, so bytes under a 0xff mask
+; compare equal and contribute nothing. mask holds 16 rows of 16 bytes.
+;unsigned int vp8_sad16x16_unmasked_wmt(
+;    unsigned char *src_ptr, int src_stride,
+;    unsigned char *ref_ptr, int ref_stride, unsigned char *mask)
+global sym(vp8_sad16x16_unmasked_wmt)
+sym(vp8_sad16x16_unmasked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+
+    mov             r8,         arg(4) ;mask (r8 is caller-saved, rbx is not)
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3    ;running sum, one per 8-byte half
+
+next_vp8_sad16x16_unmasked_wmt:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [r8]
+    por             xmm0,       xmm2
+    por             xmm1,       xmm2
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi, rax
+    add             rdi, rdx
+    add             r8,   16
+
+    dec rcx
+    jnz next_vp8_sad16x16_unmasked_wmt
+
+    movdqa          xmm4 ,     xmm3     ;add the high half sum to the low one
+    psrldq          xmm4,       8
+    paddw           xmm3,      xmm4
+    movq            rax,        xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+; 16x16 blend: dst = (masked & mask) | (unmasked & ~mask), row by row.
+; masked and unmasked both advance by src_stride, dst by dst_stride;
+; mask holds 16 rows of 16 bytes. rax is not set to a result.
+;unsigned int vp8_masked_predictor_wmt(
+;    unsigned char *masked, unsigned char *unmasked, int src_stride,
+;    unsigned char *dst_ptr, int dst_stride, unsigned char *mask)
+global sym(vp8_masked_predictor_wmt)
+sym(vp8_masked_predictor_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;ref_ptr
+
+    mov             r8,         arg(5) ;mask (r8 is caller-saved, rbx is not)
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ; destination
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+next_vp8_masked_predictor_wmt:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [r8]
+
+    pand            xmm0,       xmm2
+    pandn           xmm2,       xmm1
+    por             xmm0,       xmm2
+    movdqu          [r11],      xmm0
+
+    add             r11, rdx
+    add             rsi, rax
+    add             rdi, rax            ;second source shares src_stride, not dst_stride
+    add             r8,   16
+
+    dec rcx
+    jnz next_vp8_masked_predictor_wmt
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+; 8x8 blend: dst = (masked & mask) | (unmasked & ~mask), row by row.
+; masked and unmasked both advance by src_stride, dst by dst_stride;
+; mask holds 8 rows of 8 bytes. rax is not set to a result.
+;unsigned int vp8_masked_predictor_uv_wmt(
+;    unsigned char *masked, unsigned char *unmasked, int src_stride,
+;    unsigned char *dst_ptr, int dst_stride, unsigned char *mask)
+global sym(vp8_masked_predictor_uv_wmt)
+sym(vp8_masked_predictor_uv_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;ref_ptr
+
+    mov             r8,         arg(5) ;mask (r8 is caller-saved, rbx is not)
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ; destination
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        8
+
+    pxor            xmm3,       xmm3
+
+next_vp8_masked_predictor_uv_wmt:
+    movq            xmm0,       [rsi]
+    movq            xmm1,       [rdi]
+    movq            xmm2,       [r8]
+
+    pand            xmm0,       xmm2
+    pandn           xmm2,       xmm1
+    por             xmm0,       xmm2
+    movq            [r11],      xmm0
+
+    add             r11, rdx
+    add             rsi, rax
+    add             rdi, rax
+    add             r8,   8
+
+    dec rcx
+    jnz next_vp8_masked_predictor_uv_wmt
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+; Builds the 8x8 chroma mask from a 16x16 luma mask by keeping the even
+; bytes of every other luma row (rows 0, 2, .., 14). rax is not set.
+;unsigned int vp8_uv_from_y_mask(unsigned char *ymask, unsigned char *uvmask)
+global sym(vp8_uv_from_y_mask)
+sym(vp8_uv_from_y_mask):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;dst_ptr
+
+
+    mov             rcx,        8       ;one iteration per chroma row
+
+    pxor            xmm3,       xmm3
+
+next_p8_uv_from_y_mask:
+    movdqu          xmm0,       [rsi]
+    pshufb          xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
+    movq            [rdi],xmm0          ;only the low 8 shuffled bytes are stored
+    add             rdi, 8
+    add             rsi,32              ;skip one luma row (2 x 16 bytes)
+
+    dec rcx
+    jnz next_p8_uv_from_y_mask
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:                                 ; pshufb control: even bytes into the low half
+    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 0a7942d..f23fd00 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -208,11 +208,13 @@
 
 
 
-
 static void mb_mode_mv_init(VP8D_COMP *pbi)
 {
     vp8_reader *const bc = & pbi->bc;
     MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+#if CONFIG_SEGMENTATION
+    MACROBLOCKD *const xd  = & pbi->mb;
+#endif
 
 #if CONFIG_ERROR_CONCEALMENT
     /* Default is that no macroblock is corrupt, therefore we initialize
@@ -253,6 +255,9 @@
         }
 
         read_mvcontexts(bc, mvc);
+#if CONFIG_SEGMENTATION
+    xd->temporal_update = vp8_read_bit(bc);
+#endif
     }
 }
 
@@ -263,7 +268,11 @@
     vp8_reader *const bc = & pbi->bc;
     MV_CONTEXT *const mvc = pbi->common.fc.mvc;
     const int mis = pbi->common.mode_info_stride;
-
+#if CONFIG_SEGMENTATION
+    MACROBLOCKD *const xd  = & pbi->mb;
+    int sum;
+    int index = mb_row * pbi->common.mb_cols + mb_col;
+#endif
     int_mv *const mv = & mbmi->mv;
     int mb_to_left_edge;
     int mb_to_right_edge;
@@ -274,7 +283,6 @@
     mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
     mb_to_top_edge -= LEFT_TOP_MARGIN;
     mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
-
     mbmi->need_to_clamp_mvs = 0;
     /* Distance of Mb to the various image edges.
      * These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
@@ -289,7 +297,41 @@
 
     /* If required read in new segmentation data for this MB */
     if (pbi->mb.update_mb_segmentation_map)
-        vp8_read_mb_features(bc, mbmi, &pbi->mb);
+            {
+#if CONFIG_SEGMENTATION
+                if (xd->temporal_update)
+                {
+                    sum = 0;
+
+                    if (mb_col != 0)
+                        sum += (mi-1)->mbmi.segment_flag;
+                    if (mb_row != 0)
+                        sum += (mi-pbi->common.mb_cols)->mbmi.segment_flag;
+
+                    if (vp8_read(bc, xd->mb_segment_tree_probs[3+sum]) == 0)
+                    {
+                        mbmi->segment_id = pbi->segmentation_map[index];
+                        mbmi->segment_flag = 0;
+                    }
+                    else
+                    {
+                        vp8_read_mb_features(bc, &mi->mbmi, &pbi->mb);
+                        mbmi->segment_flag = 1;
+                        pbi->segmentation_map[index] = mbmi->segment_id;
+                    }
+
+                }
+                else
+                {
+                    vp8_read_mb_features(bc, &mi->mbmi, &pbi->mb);
+                    pbi->segmentation_map[index] = mbmi->segment_id;
+                }
+                index++;
+#else
+                vp8_read_mb_features(bc, &mi->mbmi, &pbi->mb);
+#endif
+            }
+
 
     /* Read the macroblock coeff skip flag if this feature is in use, else default to 0 */
     if (pbi->common.mb_no_coeff_skip)
@@ -508,4 +550,3 @@
         mi++;           /* skip left predictor each row */
     }
 }
-
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 9b35ffd..c6cea3a 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -40,6 +40,10 @@
 #include <assert.h>
 #include <stdio.h>
 
+#ifdef DEC_DEBUG
+int dec_debug = 0;
+#endif
+
 void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
 {
     int i;
@@ -125,6 +129,16 @@
         vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
                                            xd->dst.u_buffer, xd->dst.v_buffer,
                                            xd->dst.y_stride, xd->dst.uv_stride);
+#ifdef DEC_DEBUG
+        if (dec_debug) {
+          int i, j;
+          printf("Generating predictors\n");
+          for (i=0;i<16;i++) {
+            for (j=0;j<16;j++) printf("%3d ", xd->dst.y_buffer[i*xd->dst.y_stride+j]);
+            printf("\n");
+          }
+        }
+#endif
     }
 }
 
@@ -192,7 +206,28 @@
     }
     else
     {
-        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+#if CONFIG_T8X8
+        for(i = 0; i < 25; i++)
+        {
+            xd->block[i].eob = 0;
+            xd->eobs[i] = 0;
+        }
+        if (xd->mode_info_context->mbmi.segment_id >= 2)
+            eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd);
+        else
+#endif
+            eobtotal = vp8_decode_mb_tokens(pbi, xd);
+#ifdef DEC_DEBUG
+        if (dec_debug) {
+            printf("\nTokens (%d)\n", eobtotal);
+            for (i =0; i<400; i++) {
+                printf("%3d ", xd->qcoeff[i]);
+                if (i%16 == 15) printf("\n");
+            }
+            printf("\n");
+        }
+#endif
     }
 
     /* Perform temporary clamping of the MV to be used for prediction */
@@ -276,10 +311,22 @@
     }
     else if (mode == SPLITMV)
     {
-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->predictor, xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs);
+#if CONFIG_T8X8
+        if(xd->mode_info_context->mbmi.segment_id >= 2)
+        {
+            DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block_8x8)
+                (xd->qcoeff, xd->block[0].dequant,
+                xd->predictor, xd->dst.y_buffer,
+                xd->dst.y_stride, xd->eobs, xd);
+        }
+        else
+#endif
+        {
+            DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+                (xd->qcoeff, xd->block[0].dequant,
+                xd->predictor, xd->dst.y_buffer,
+                xd->dst.y_stride, xd->eobs);
+        }
     }
     else
     {
@@ -288,10 +335,23 @@
         DEQUANT_INVOKE(&pbi->dequant, block)(b);
 
         /* do 2nd order transform on the dc block */
-        if (xd->eobs[24] > 1)
+#if CONFIG_T8X8
+        if(xd->mode_info_context->mbmi.segment_id >= 2)
         {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
+            DEQUANT_INVOKE(&pbi->dequant, block_8x8)(b);
+#ifdef DEC_DEBUG
+            if (dec_debug)
+            {
+                int j;
+                printf("DQcoeff Haar\n");
+                for (j=0;j<16;j++) {
+                    printf("%d ", b->dqcoeff[j]);
+                }
+                printf("\n");
+            }
+#endif
+            IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+            ((int *)b->qcoeff)[0] = 0;//2nd order block are set to 0 after inverse transform
             ((int *)b->qcoeff)[1] = 0;
             ((int *)b->qcoeff)[2] = 0;
             ((int *)b->qcoeff)[3] = 0;
@@ -299,23 +359,55 @@
             ((int *)b->qcoeff)[5] = 0;
             ((int *)b->qcoeff)[6] = 0;
             ((int *)b->qcoeff)[7] = 0;
-        }
-        else
-        {
-            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-            ((int *)b->qcoeff)[0] = 0;
+            DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block_8x8)
+                (xd->qcoeff, xd->block[0].dequant,
+                xd->predictor, xd->dst.y_buffer,
+                xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+
         }
 
-        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->predictor, xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+        else
+#endif
+            if (xd->eobs[24] > 1)
+            {
+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+                ((int *)b->qcoeff)[0] = 0;
+                ((int *)b->qcoeff)[1] = 0;
+                ((int *)b->qcoeff)[2] = 0;
+                ((int *)b->qcoeff)[3] = 0;
+                ((int *)b->qcoeff)[4] = 0;
+                ((int *)b->qcoeff)[5] = 0;
+                ((int *)b->qcoeff)[6] = 0;
+                ((int *)b->qcoeff)[7] = 0;
+            }
+            else
+            {
+                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+                ((int *)b->qcoeff)[0] = 0;
+            }
+
+            DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+                (xd->qcoeff, xd->block[0].dequant,
+                xd->predictor, xd->dst.y_buffer,
+                xd->dst.y_stride, xd->eobs, xd->block[24].diff);
     }
+#if CONFIG_T8X8
+    if(xd->mode_info_context->mbmi.segment_id >= 2)
+    {
+        DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block_8x8)//
+            (xd->qcoeff+16*16, xd->block[16].dequant,
+            xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+            xd->dst.uv_stride, xd->eobs+16, xd);//
+
+    }
+    else
+#endif
 
     DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
                     (xd->qcoeff+16*16, xd->block[16].dequant,
                      xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
                      xd->dst.uv_stride, xd->eobs+16);
+
 }
 
 
@@ -423,6 +515,9 @@
 
         vp8_build_uvmvs(xd, pc->full_pixel);
 
+#ifdef DEC_DEBUG
+        dec_debug = (pc->current_video_frame==5 && mb_row==2 && mb_col==3);
+#endif
         /*
         if(pc->current_video_frame==0 &&mb_col==1 && mb_row==0)
         pbi->debugoutput =1;
@@ -433,7 +528,6 @@
 
         /* check if the boolean decoder has suffered an error */
         xd->corrupted |= vp8dx_bool_error(xd->current_bc);
-
         recon_yoffset += 16;
         recon_uvoffset += 8;
 
@@ -787,7 +881,6 @@
 
     /* Is segmentation enabled */
     xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
-
     if (xd->segmentation_enabled)
     {
         /* Signal whether or not the segmentation map is being explicitly updated this frame. */
@@ -823,9 +916,12 @@
         {
             /* Which macro block level features are enabled */
             vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
-
+#if CONFIG_SEGMENTATION
             /* Read the probs used to decode the segment id for each macro block. */
+            for (i = 0; i < MB_FEATURE_TREE_PROBS+3; i++)
+#else
             for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+#endif
             {
                 /* If not explicitly set value is defaulted to 255 by memset above */
                 if (vp8_read_bit(bc))
@@ -991,10 +1087,36 @@
                         }
                     }
     }
+#if CONFIG_T8X8
+    {
+        // read coef probability tree
+
+        for (i = 0; i < BLOCK_TYPES; i++)
+            for (j = 0; j < COEF_BANDS; j++)
+                for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+                    for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++)
+                    {
+
+                        vp8_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
+
+                        if (vp8_read(bc, vp8_coef_update_probs_8x8 [i][j][k][l]))
+                        {
+                            *p = (vp8_prob)vp8_read_literal(bc, 8);
+
+                        }
+                    }
+    }
+#endif
 
     vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
     vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
 
+#if CONFIG_SEGMENTATION
+     // Create the decoder segmentation map on first use; calloc sets all entries to 0
+     if (!pbi->segmentation_map)
+       CHECK_MEM_ERROR(pbi->segmentation_map, vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
+#endif
+
     /* set up frame new frame for intra coded blocks */
 #if CONFIG_MULTITHREAD
     if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
@@ -1088,6 +1210,7 @@
         fclose(f);
     }
 #endif
+    //printf("Frame %d Done\n", frame_count++);
 
     return 0;
 }
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index dd0c13b..956acba 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -13,13 +13,22 @@
 #include "dequantize.h"
 #include "vp8/common/idct.h"
 #include "vpx_mem/vpx_mem.h"
+#include "onyxd_int.h"
 
 extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ;
 extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+#if CONFIG_T8X8
+extern void vp8_short_idct8x8_c(short *input, short *output, int pitch);
+extern void vp8_short_idct8x8_1_c(short *input, short *output, int pitch);
+#endif
 
+#ifdef DEC_DEBUG
+extern int dec_debug;
+#endif
 
 void vp8_dequantize_b_c(BLOCKD *d)
 {
+
     int i;
     short *DQ  = d->dqcoeff;
     short *Q   = d->qcoeff;
@@ -111,3 +120,211 @@
         pred += pitch;
     }
 }
+
+#if CONFIG_T8X8
+void vp8_dequantize_b_8x8_c(BLOCKD *d) /* scales the first 16 entries of d->qcoeff into d->dqcoeff */
+{
+    const short *quantized = d->qcoeff;
+    const short *factors = d->dequant;
+    short *scaled = d->dqcoeff;
+    int idx;
+
+    /* element-wise product; the original author notes this is just for the 2x2 haar transform */
+    for (idx = 0; idx < 16; ++idx)
+        scaled[idx] = (short)(quantized[idx] * factors[idx]);
+
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Dequantize 2x2\n");
+      for (j=0;j<16;j++) printf("%d ", quantized[j]); printf("\n");
+      for (j=0;j<16;j++) printf("%d ", scaled[j]); printf("\n");
+    }
+#endif
+}
+
+void vp8_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+                                unsigned char *dest, int pitch, int stride)//, MACROBLOCKD *xd, short blk_idx
+{
+    short output[64];                   /* residual produced by vp8_short_idct8x8_c */
+    const short *residual = output;
+    const unsigned char *pred_row = pred;
+    unsigned char *dest_row = dest;
+    int row, col;
+    int i;
+
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Input 8x8\n");
+      for (j=0;j<64;j++) {
+        printf("%d ", input[j]);
+        if (j%8 == 7) printf("\n");
+      }
+    }
+#endif
+    /* dequantize in place: dq[0] scales the DC term, dq[1] all 63 AC terms */
+    input[0] = input[0] * dq[0];
+    for (i = 1; i < 64; i++)
+        input[i] = input[i] * dq[1];
+
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Input DQ 8x8\n");
+      for (j=0;j<64;j++) {
+        printf("%d ", input[j]);
+        if (j%8 == 7) printf("\n");
+      }
+    }
+#endif
+
+    /* inverse 8x8 transform into output[]; original note: the idct halves ( >> 1) the pitch */
+    vp8_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Output 8x8\n");
+      for (j=0;j<64;j++) {
+        printf("%d ", output[j]);
+        if (j%8 == 7) printf("\n");
+      }
+    }
+#endif
+
+    vpx_memset(input, 0, 128);  /* zeroes 128 bytes of input[] once it has been consumed */
+
+    /* Reconstruction: every sample of the 8x8 area becomes
+     *     clamp(pred + residual, 0, 255).
+     * output[] is read with a row length of 8 shorts, the predictor
+     * advances by pitch per row and the destination by stride per row.
+     */
+    for (row = 0; row < 8; row++)
+    {
+        for (col = 0; col < 8; col++)
+        {
+            int sum = residual[col] + pred_row[col];
+
+            if (sum < 0)
+                sum = 0;
+            else if (sum > 255)
+                sum = 255;
+
+            dest_row[col] = (unsigned char) sum;
+        }
+
+        /* step to the next row in each of the three buffers */
+        residual += 8;
+        pred_row += pitch;
+        dest_row += stride;
+    }
+
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int k,j;
+      printf("Final 8x8\n");
+      for (j=0;j<8;j++) {
+        for (k=0;k<8;k++) {
+          printf("%d ", dest[k]);
+        }
+        printf("\n");
+        dest+=stride;
+      }
+    }
+#endif
+}
+
+void vp8_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc)// Dc replaces input[0]; original note: for 1st order T in some rare case
+{
+    short output[64];                   /* residual produced by vp8_short_idct8x8_c */
+    const short *residual = output;
+    const unsigned char *pred_row = pred;
+    unsigned char *dest_row = dest;
+    int row, col;
+    int i;
+
+    input[0] = (short)Dc;//Dc is the reconstructed value, do not need dequantization
+    /* whatever input[0] held is overwritten; only the 63 AC terms are dequantized below */
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Input 8x8\n");
+      for (j=0;j<64;j++) {
+        printf("%d ", input[j]);
+        if (j%8 == 7) printf("\n");
+      }
+    }
+#endif
+    for (i = 1; i < 64; i++)
+    {
+        input[i] = input[i] * dq[1];    /* i is never 0 here, so the factor is always dq[1] */
+    }
+
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Input DQ 8x8\n");
+      for (j=0;j<64;j++) {
+        printf("%d ", input[j]);
+        if (j%8 == 7) printf("\n");
+      }
+    }
+#endif
+
+    /* inverse 8x8 transform into output[]; original note: the idct halves ( >> 1) the pitch */
+    vp8_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int j;
+      printf("Output 8x8\n");
+      for (j=0;j<64;j++) {
+        printf("%d ", output[j]);
+        if (j%8 == 7) printf("\n");
+      }
+    }
+#endif
+    vpx_memset(input, 0, 128);  /* zeroes 128 bytes of input[] once it has been consumed */
+
+    /* Reconstruction: every sample of the 8x8 area becomes
+     *     clamp(pred + residual, 0, 255).
+     * output[] is read with a row length of 8 shorts, the predictor
+     * advances by pitch per row and the destination by stride per row.
+     */
+    for (row = 0; row < 8; row++)
+    {
+        for (col = 0; col < 8; col++)
+        {
+            int sum = residual[col] + pred_row[col];
+
+            if (sum < 0)
+                sum = 0;
+            else if (sum > 255)
+                sum = 255;
+
+            dest_row[col] = (unsigned char) sum;
+        }
+
+        /* step to the next row in each of the three buffers */
+        residual += 8;
+        pred_row += pitch;
+        dest_row += stride;
+    }
+
+#ifdef DEC_DEBUG
+    if (dec_debug) {
+      int k,j;
+      printf("Final 8x8\n");
+      for (j=0;j<8;j++) {
+        for (k=0;k<8;k++) {
+          printf("%d ", dest[k]);
+        }
+        printf("\n");
+        dest+=stride;
+      }
+    }
+#endif
+}
+
+#endif
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index 2e662a5..5ae6f96 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -42,6 +42,25 @@
              unsigned char *pre, unsigned char *dst_u, \
              unsigned char *dst_v, int stride, char *eobs)
 
+#if CONFIG_T8X8
+#define prototype_dequant_dc_idct_add_y_block_8x8(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *pre, unsigned char *dst, \
+             int stride, char *eobs, short *dc, MACROBLOCKD *xd)
+
+#define prototype_dequant_idct_add_y_block_8x8(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *pre, unsigned char *dst, \
+             int stride, char *eobs, MACROBLOCKD *xd)
+
+#define prototype_dequant_idct_add_uv_block_8x8(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *pre, unsigned char *dst_u, \
+             unsigned char *dst_v, int stride, char *eobs, \
+             MACROBLOCKD *xd)
+
+#endif
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/dequantize_x86.h"
 #endif
@@ -80,6 +99,38 @@
 #endif
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
 
+#if CONFIG_T8X8
+#ifndef vp8_dequant_block_8x8
+#define vp8_dequant_block_8x8 vp8_dequantize_b_8x8_c
+#endif
+extern prototype_dequant_block(vp8_dequant_block_8x8);
+
+#ifndef vp8_dequant_idct_add_8x8
+#define vp8_dequant_idct_add_8x8 vp8_dequant_idct_add_8x8_c
+#endif
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_8x8);
+
+#ifndef vp8_dequant_dc_idct_add_8x8
+#define vp8_dequant_dc_idct_add_8x8 vp8_dequant_dc_idct_add_8x8_c
+#endif
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_8x8);
+
+#ifndef vp8_dequant_dc_idct_add_y_block_8x8
+#define vp8_dequant_dc_idct_add_y_block_8x8 vp8_dequant_dc_idct_add_y_block_8x8_c
+#endif
+extern prototype_dequant_dc_idct_add_y_block_8x8(vp8_dequant_dc_idct_add_y_block_8x8);
+
+#ifndef vp8_dequant_idct_add_y_block_8x8
+#define vp8_dequant_idct_add_y_block_8x8 vp8_dequant_idct_add_y_block_8x8_c
+#endif
+extern prototype_dequant_idct_add_y_block_8x8(vp8_dequant_idct_add_y_block_8x8);
+
+#ifndef vp8_dequant_idct_add_uv_block_8x8
+#define vp8_dequant_idct_add_uv_block_8x8 vp8_dequant_idct_add_uv_block_8x8_c
+#endif
+extern prototype_dequant_idct_add_uv_block_8x8(vp8_dequant_idct_add_uv_block_8x8);
+
+#endif
 
 typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
 
@@ -93,6 +144,13 @@
 
 typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
 
+#if CONFIG_T8X8
+typedef prototype_dequant_dc_idct_add_y_block_8x8((*vp8_dequant_dc_idct_add_y_block_fn_t_8x8));
+
+typedef prototype_dequant_idct_add_y_block_8x8((*vp8_dequant_idct_add_y_block_fn_t_8x8));
+
+typedef prototype_dequant_idct_add_uv_block_8x8((*vp8_dequant_idct_add_uv_block_fn_t_8x8));
+#endif
 typedef struct
 {
     vp8_dequant_block_fn_t               block;
@@ -101,6 +159,14 @@
     vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
     vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;
     vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
+#if CONFIG_T8X8
+    vp8_dequant_block_fn_t               block_8x8;
+    vp8_dequant_idct_add_fn_t            idct_add_8x8;
+    vp8_dequant_dc_idct_add_fn_t         dc_idct_add_8x8;
+    vp8_dequant_dc_idct_add_y_block_fn_t_8x8 dc_idct_add_y_block_8x8;
+    vp8_dequant_idct_add_y_block_fn_t_8x8    idct_add_y_block_8x8;
+    vp8_dequant_idct_add_uv_block_fn_t_8x8   idct_add_uv_block_8x8;
+#endif
 } vp8_dequant_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 98be685..f8a6e7e 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -26,6 +26,18 @@
     6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
     6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X
 };
+#if CONFIG_T8X8
+DECLARE_ALIGNED(64, static const unsigned char, coef_bands_x_8x8[64]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
+  5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+};
+#endif
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
@@ -44,7 +56,6 @@
 #define CAT4_MIN_VAL   19
 #define CAT5_MIN_VAL   35
 #define CAT6_MIN_VAL   67
-
 #define CAT1_PROB0    159
 #define CAT2_PROB0    145
 #define CAT2_PROB1    165
@@ -157,7 +168,48 @@
         range = range - split; \
         NORMALIZE \
     }
-
+#if CONFIG_T8X8
+#define DECODE_AND_LOOP_IF_ZERO_8x8_2(probability,branch) \
+    { /* 4-position variant: position limit 3, band offsets from coef_bands_x */ \
+        split = 1 + ((( probability*(range-1) ) ) >> 8); \
+        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
+        FILL \
+        if ( value < bigsplit ) /* taken path: advance c and jump to 'branch', or finish the block */ \
+        { \
+            range = split; \
+            NORMALIZE \
+            Prob = coef_probs; \
+            if(c<3) {\
+            ++c; \
+            Prob += coef_bands_x[c]; \
+            goto branch; \
+            } goto BLOCK_FINISHED_8x8; /*for malformed input */\
+        } \
+        value -= bigsplit; /* other path: consume the split and fall through to the code after the macro */ \
+        range = range - split; \
+        NORMALIZE \
+    }
+#define DECODE_AND_LOOP_IF_ZERO_8X8(probability,branch) \
+    { /* 64-position variant: position limit 63, band offsets from coef_bands_x_8x8 */ \
+        split = 1 + ((( probability*(range-1) ) ) >> 8); \
+        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
+        FILL \
+        if ( value < bigsplit ) /* taken path: advance c and jump to 'branch', or finish the block */ \
+        { \
+            range = split; \
+            NORMALIZE \
+            Prob = coef_probs; \
+            if(c<63) {\
+            ++c; \
+            Prob += coef_bands_x_8x8[c]; \
+            goto branch; \
+            } goto BLOCK_FINISHED_8x8; /*for malformed input */\
+        } \
+        value -= bigsplit; /* other path: consume the split and fall through to the code after the macro */ \
+        range = range - split; \
+        NORMALIZE \
+    }
+#endif
 #define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
     DECODE_AND_APPLYSIGN(val) \
     Prob = coef_probs + (ENTROPY_NODES*2); \
@@ -168,6 +220,26 @@
     qcoeff_ptr [ 15 ] = (INT16) v; \
     goto BLOCK_FINISHED;
 
+#if CONFIG_T8X8
+#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val) \
+    DECODE_AND_APPLYSIGN(val) /* defined elsewhere; presumably leaves the signed value in v */ \
+    Prob = coef_probs + (ENTROPY_NODES*2); \
+    if(c < 3){ /* not the last of the 4 positions: store, advance, decode the next token */ \
+        qcoeff_ptr [ scan[c] ] = (INT16) v; \
+        ++c; \
+        goto DO_WHILE_8x8; }\
+    qcoeff_ptr [ scan[3] ] = (INT16) v; /* otherwise store at the last scan position and finish */ \
+    goto BLOCK_FINISHED_8x8;
+#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val) \
+    DECODE_AND_APPLYSIGN(val) /* defined elsewhere; presumably leaves the signed value in v */ \
+    Prob = coef_probs + (ENTROPY_NODES*2); \
+    if(c < 63){ /* not the last of the 64 positions: store, advance, decode the next token */ \
+        qcoeff_ptr [ scan[c] ] = (INT16) v; \
+        ++c; \
+        goto DO_WHILE_8x8; }\
+    qcoeff_ptr [ scan[63] ] = (INT16) v; /* otherwise store at the last scan position and finish */ \
+    goto BLOCK_FINISHED_8x8;
+#endif
 
 #define DECODE_EXTRABIT_AND_ADJUST_VAL(prob, bits_count)\
     split = 1 +  (((range-1) * prob) >> 8); \
@@ -185,6 +257,354 @@
     }\
     NORMALIZE
 
+#if CONFIG_T8X8
+int vp8_decode_mb_tokens_8x8(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
+    ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
+    const VP8_COMMON *const oc = & dx->common;
+    /* 8x8 token decode: fills x->qcoeff and x->eobs, updates the A/L contexts, returns eobtotal. */
+    BOOL_DECODER *bc = x->current_bc;
+
+    char *eobs = x->eobs;
+
+    ENTROPY_CONTEXT *a, *a1;
+    ENTROPY_CONTEXT *l, *l1;
+    int i;
+
+    int eobtotal = 0;
+
+    register int count;
+
+    const BOOL_DATA *bufptr;
+    const BOOL_DATA *bufend;
+    register unsigned int range;
+    VP8_BD_VALUE value;
+    const int *scan;//
+    register unsigned int shift;
+    UINT32 split;
+    VP8_BD_VALUE bigsplit;
+    INT16 *qcoeff_ptr;
+
+    const vp8_prob *coef_probs;//
+    int type;
+    int stop;
+    INT16 val, bits_count;
+    INT16 c;
+    INT16 v;
+    const vp8_prob *Prob;//
+    /* defaults, kept for B_PRED / SPLITMV: no block 24, start at block 0 with type 3 */
+    type = 3;
+    i = 0;
+    stop = 16;
+
+    scan = vp8_default_zig_zag1d_8x8;
+    qcoeff_ptr = &x->qcoeff[0];
+    /* every other mode decodes block 24 first, as type 1 with the 4x4 zig-zag table */
+    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        i = 24;
+        stop = 24;
+        type = 1;
+        qcoeff_ptr += 24*16;
+        eobtotal -= 4;
+        scan = vp8_default_zig_zag1d;
+    }
+    /* work on local copies of the bool-decoder state; written back before returning */
+    bufend  = bc->user_buffer_end;
+    bufptr  = bc->user_buffer;
+    value   = bc->value;
+    count   = bc->count;
+    range   = bc->range;
+
+    coef_probs = oc->fc.coef_probs_8x8 [type] [ 0 ] [0];
+
+BLOCK_LOOP_8x8:
+    a = A + vp8_block2above[i];
+    l = L + vp8_block2left[i];
+    /* a1/l1: second above/left context for the 8x8 combine; neither set nor read when i == 24 */
+    if(i < 16)
+    {
+       a1 = A + vp8_block2above[i+1];
+       l1 = L + vp8_block2left[i+4];
+    }
+    else if(i<24)
+    {
+      a1 = A + vp8_block2above[i+1];
+      l1 = L + vp8_block2left[i+2];
+
+    }
+    c = (INT16)(!type); /* first position: 1 when type is 0, otherwise 0 */
+    /* v = count of non-zero neighbouring contexts (0..2); it selects the probability row below */
+//    Dest = ((A)!=0) + ((B)!=0);
+    if(i==24)
+    {
+      VP8_COMBINEENTROPYCONTEXTS(v, *a, *l);
+    }
+    else
+    {
+      VP8_COMBINEENTROPYCONTEXTS_8x8(v, *a, *l, *a1, *l1);
+    }
+
+    Prob = coef_probs;
+    Prob += v * ENTROPY_NODES;
+    /* re-entered per position: add the band offset for c (4x4 table for block 24, 8x8 table otherwise) */
+DO_WHILE_8x8:
+    if(i==24)
+      Prob += coef_bands_x[c];
+    else
+      Prob += coef_bands_x_8x8[c];
+    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED_8x8);
+
+CHECK_0_8x8_:
+    if (i==24)
+    {
+      DECODE_AND_LOOP_IF_ZERO_8x8_2(Prob[ZERO_CONTEXT_NODE], CHECK_0_8x8_);
+    }
+    else
+    {
+      DECODE_AND_LOOP_IF_ZERO_8X8(Prob[ZERO_CONTEXT_NODE], CHECK_0_8x8_);
+    }
+    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_8x8_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_8x8_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_8x8_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_8x8_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_8x8_);
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val;
+    bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length;
+    /* category 6: the extra-bit macro runs once per bits_count value, from Length down to 0 */
+    do
+    {
+        DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
+        bits_count -- ;
+    }
+    while (bits_count >= 0);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val);
+    }
+
+CAT_FIVE_CONTEXT_NODE_0_8x8_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val);
+    }
+
+CAT_THREEFOUR_CONTEXT_NODE_0_8x8_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_8x8_);
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val);
+    }
+
+CAT_THREE_CONTEXT_NODE_0_8x8_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val);
+    }
+
+HIGH_LOW_CONTEXT_NODE_0_8x8_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_8x8_);
+
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val);
+    }
+
+CAT_ONE_CONTEXT_NODE_0_8x8_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(val);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(val);
+    }
+
+LOW_VAL_CONTEXT_NODE_0_8x8_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_8x8_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_8x8_);
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(4);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(4);
+    }
+
+
+THREE_CONTEXT_NODE_0_8x8_:
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(3);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(3);
+    }
+
+
+TWO_CONTEXT_NODE_0_8x8_:
+    if(i==24)
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8_2(2);
+    }
+    else
+    {
+        DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT_8x8(2);
+    }
+
+
+ONE_CONTEXT_NODE_0_8x8_:
+    DECODE_AND_APPLYSIGN(1);
+    Prob = coef_probs + ENTROPY_NODES;
+
+    if (i==24)
+    {
+      if (c < 3)//15
+      {
+        qcoeff_ptr [ scan[c] ] = (INT16) v;
+        ++c;
+        goto DO_WHILE_8x8;
+      }
+    }
+    else
+    {
+      if (c < 63)
+      {
+        qcoeff_ptr [ scan[c] ] = (INT16) v;
+        ++c;
+        goto DO_WHILE_8x8;
+      }
+    }
+   /* c is at the last position (3 or 63): store the value, then fall through to finish the block */
+   if(i==24)
+       qcoeff_ptr [ scan[3] ] = (INT16) v;//15
+   else
+       qcoeff_ptr [ scan[63] ] = (INT16) v;
+
+
+BLOCK_FINISHED_8x8:
+    *a = *l = ((eobs[i] = c) != !type);   // any nonzero data?
+    /*if (i!=24) {
+      *(A + vp8_block2above[i+1]) = *(A + vp8_block2above[i+2]) = *(A + vp8_block2above[i+3]) = *a;
+      *(L + vp8_block2left[i+1]) = *(L + vp8_block2left[i+2]) = *(L + vp8_block2left[i+3]) = *l;
+    }*/
+    /* copy the flag just stored in *a / *l to further context entries; skipped for block 24 */
+    if (i!=24)
+    {
+      if(i==0)
+      {
+        *(A + vp8_block2above[1]) = *(A + vp8_block2above[4]) = *(A + vp8_block2above[5]) = *a;
+        *(L + vp8_block2left[1]) = *(L + vp8_block2left[4]) = *(L + vp8_block2left[5]) = *l;
+      }
+      else if(i==4)
+      {
+        *(A + vp8_block2above[2]) = *(A + vp8_block2above[3]) = *(A + vp8_block2above[6]) = *(A + vp8_block2above[7]) = *a;
+        *(L + vp8_block2left[2]) = *(L + vp8_block2left[3]) = *(L + vp8_block2left[6]) = *(L + vp8_block2left[7]) = *l;
+        *(A + vp8_block2above[4]) = *(A + vp8_block2above[1]);
+        *(L + vp8_block2left[4]) = *(L + vp8_block2left[1]);
+      }
+      else if(i==8)
+      {
+        *(A + vp8_block2above[9]) = *(A + vp8_block2above[12]) = *(A + vp8_block2above[13]) = *a;
+        *(L + vp8_block2left[9]) = *(L + vp8_block2left[12]) = *(L + vp8_block2left[13]) = *l;
+
+      }
+      else if(i==12)
+      {
+        *(A + vp8_block2above[10]) = *(A + vp8_block2above[11]) = *(A + vp8_block2above[14]) = *(A + vp8_block2above[15]) = *a;
+        *(L + vp8_block2left[10]) = *(L + vp8_block2left[11]) = *(L + vp8_block2left[14]) = *(L + vp8_block2left[15]) = *l;
+        *(A + vp8_block2above[12]) = *(A + vp8_block2above[8]);
+        *(L + vp8_block2left[12]) = *(L + vp8_block2left[8]);
+
+      }
+      else
+      {
+        *(A + vp8_block2above[i+1]) = *(A + vp8_block2above[i+2]) = *(A + vp8_block2above[i+3]) = *a;
+        *(L + vp8_block2left[i+1]) = *(L + vp8_block2left[i+2]) = *(L + vp8_block2left[i+3]) = *l;
+
+      }
+    }
+    /* accumulate, then step to the next block: 16 coefficients after block 24, 64 otherwise */
+    eobtotal += c;
+    qcoeff_ptr += (i==24 ? 16 : 64);
+
+    i+=4;
+
+    if (i < stop)
+        goto BLOCK_LOOP_8x8;
+    /* i > 24 only happens straight after block 24 (i is then 28): restart at block 0 as type 0 */
+    if (i > 24)
+    {
+        type = 0;
+        i = 0;
+        stop = 16;
+        coef_probs = oc->fc.coef_probs_8x8 [type] [ 0 ] [0];
+        qcoeff_ptr -= (24*16 + 16);
+        scan = vp8_default_zig_zag1d_8x8;
+        goto BLOCK_LOOP_8x8;
+    }
+    /* blocks 0, 4, 8 and 12 are done: switch to type 2 and continue with blocks 16 and 20 */
+    if (i == 16)
+    {
+        type = 2;
+        coef_probs = oc->fc.coef_probs_8x8 [type] [ 0 ] [0];
+        stop = 24;
+        goto BLOCK_LOOP_8x8;
+    }
+
+    FILL
+    bc->user_buffer = bufptr;
+    bc->value = value;
+    bc->count = count;
+    bc->range = range;
+    /* sum of the per-block c values, less the 4 subtracted up front when block 24 was decoded */
+    return eobtotal;
+
+}
+#endif
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
     ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
@@ -275,7 +695,7 @@
                               CAT_FIVE_CONTEXT_NODE_0_);
 
     val = CAT6_MIN_VAL;
-    bits_count = CONFIG_EXTEND_QRANGE?12:10;
+    bits_count = CONFIG_EXTEND_QRANGE?14:12;
 
     do
     {
@@ -382,6 +802,7 @@
     bc->value = value;
     bc->count = count;
     bc->range = range;
+
     return eobtotal;
 
 }
diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h
index 8640bda..c5305bb 100644
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -16,5 +16,8 @@
 
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
+#if CONFIG_T8X8
+int vp8_decode_mb_tokens_8x8(VP8D_COMP *, MACROBLOCKD *);
+#endif
 
 #endif /* DETOKENIZE_H */
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index f766532..fc5fdb3 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -21,6 +21,17 @@
     /* Pure C: */
 #if CONFIG_RUNTIME_CPU_DETECT
     pbi->mb.rtcd                     = &pbi->common.rtcd;
+
+#if CONFIG_T8X8
+
+    pbi->dequant.block_8x8           = vp8_dequantize_b_8x8_c;
+    pbi->dequant.idct_add_8x8        = vp8_dequant_idct_add_8x8_c;
+    pbi->dequant.dc_idct_add_8x8     = vp8_dequant_dc_idct_add_8x8_c;
+    pbi->dequant.dc_idct_add_y_block_8x8 = vp8_dequant_dc_idct_add_y_block_8x8_c;
+    pbi->dequant.idct_add_y_block_8x8 = vp8_dequant_idct_add_y_block_8x8_c;
+    pbi->dequant.idct_add_uv_block_8x8 = vp8_dequant_idct_add_uv_block_8x8_c;
+
+#endif
     pbi->dequant.block               = vp8_dequantize_b_c;
     pbi->dequant.idct_add            = vp8_dequant_idct_add_c;
     pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_c;
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index df01923..2015d52 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -122,3 +122,45 @@
         dstv += 4*stride - 8;
     }
 }
+
+#if CONFIG_T8X8
+void vp8_dequant_dc_idct_add_y_block_8x8_c
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, short *dc, MACROBLOCKD *xd)
+{   /* four 8x8 blocks in raster order, 64 coefficients each; eobs and xd are not read */
+    static const int dc_pos[4] = { 0, 1, 4, 8 };   /* entry of dc[] handed to each block */
+    int n;
+    for (n = 0; n < 4; n++)
+        vp8_dequant_dc_idct_add_8x8_c (q + 64 * n, dq, pre + (n >> 1) * 8 * 16 + (n & 1) * 8,
+                                       dst + (n >> 1) * 8 * stride + (n & 1) * 8,
+                                       16, stride, dc[dc_pos[n]]);
+}
+
+void vp8_dequant_idct_add_y_block_8x8_c
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dst, int stride, char *eobs, MACROBLOCKD *xd)
+{
+  /* Raster walk over four 8x8 blocks: 64 coefficients each, predictor
+   * rows are 16 bytes apart, destination rows are stride bytes apart.
+   * eobs and xd are accepted but not read. */
+  int n;
+
+  for (n = 0; n < 4; n++)
+  {
+    const int row_off = (n >> 1) * 8, col_off = (n & 1) * 8;
+    vp8_dequant_idct_add_8x8_c (q + 64 * n, dq, pre + row_off * 16 + col_off, dst + row_off * stride + col_off, 16, stride);
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_8x8_c
+            (short *q, short *dq, unsigned char *pre,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, MACROBLOCKD *xd)
+{
+  /* first block: coefficients q[0..63], predictor at pre with pitch 8, written to dstu */
+  vp8_dequant_idct_add_8x8_c (&q[0], dq, &pre[0], dstu, 8, stride);
+
+  /* second block: the following 64 coefficients and predictor bytes, written to dstv */
+  vp8_dequant_idct_add_8x8_c (&q[64], dq, &pre[64], dstv, 8, stride);
+  /* eobs and xd are accepted but not read */
+}
+#endif
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 2246194..71f3232 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -43,6 +43,43 @@
 static int get_free_fb (VP8_COMMON *cm);
 static void ref_cnt_fb (int *buf, int *idx, int new_idx);
 
+#if CONFIG_DEBUG
+void vp8_recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s)
+{   /* Debug helper: appends the Y, U and V planes of s, row by row, to the file called name. */
+    FILE *yuv_file = fopen((char *)name, "ab");
+    unsigned char *src = s->y_buffer;
+    int h = s->y_height;
+
+    if (!yuv_file)      /* open failed: nothing can be written, and fwrite must not see NULL */
+        return;
+    for (; h > 0; h--)  /* Y rows: y_width bytes out of every y_stride; height 0 writes nothing */
+    {
+        fwrite(src, s->y_width, 1,  yuv_file);
+        src += s->y_stride;
+    }
+    src = s->u_buffer;
+    h = s->uv_height;
+
+    /* U rows: only uv_width bytes of every uv_stride-byte row are written */
+    for (; h > 0; h--)
+    {
+        fwrite(src, s->uv_width, 1,  yuv_file);
+        src += s->uv_stride;
+    }
+
+    src = s->v_buffer;
+    h = s->uv_height;
+
+    /* V rows: same layout as the U plane */
+    for (; h > 0; h--)
+    {
+        fwrite(src, s->uv_width, 1, yuv_file);
+        src += s->uv_stride;
+    }
+
+    fclose(yuv_file);
+}
+#endif
 
 void vp8dx_initialize()
 {
@@ -56,7 +93,6 @@
     }
 }
 
-
 VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
 {
     VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
@@ -107,13 +143,17 @@
     return (VP8D_PTR) pbi;
 }
 
-
 void vp8dx_remove_decompressor(VP8D_PTR ptr)
 {
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
 
     if (!pbi)
         return;
+#if CONFIG_SEGMENTATION
+     // Delete segmentation map
+    if (pbi->segmentation_map != 0)
+        vpx_free(pbi->segmentation_map);
+#endif
 
 #if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd)
@@ -288,6 +328,22 @@
     return err;
 }
 
+/*
+static void vp8_print_yuv_rec_mb(VP8_COMMON *cm, int mb_row, int mb_col)
+{
+  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+  unsigned char *src = s->y_buffer;
+  int i, j;
+
+  printf("After loop filter\n");
+  for (i=0;i<16;i++) {
+    for (j=0;j<16;j++)
+      printf("%3d ", src[(mb_row*16+i)*s->y_stride + mb_col*16+j]);
+    printf("\n");
+  }
+}
+*/
+
 int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, int64_t time_stamp)
 {
 #if HAVE_ARMV7
@@ -455,6 +511,9 @@
         vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
     }
 
+#if CONFIG_DEBUG
+    vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+#endif
 
     vp8_clear_system_state();
 
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index eac57ab..a7d8b5a 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -21,6 +21,8 @@
 #include "ec_types.h"
 #endif
 
+//#define DEC_DEBUG
+
 typedef struct
 {
     int ithread;
@@ -45,10 +47,16 @@
 typedef struct
 {
     int const *scan;
+#if CONFIG_T8X8
+    int const *scan_8x8;
+#endif
     UINT8 const *ptr_block2leftabove;
     vp8_tree_index const *vp8_coef_tree_ptr;
     unsigned char *norm_ptr;
     UINT8 *ptr_coef_bands_x;
+#if CONFIG_T8X8
+    UINT8 *ptr_coef_bands_x_8x8;
+#endif
 
     ENTROPY_CONTEXT_PLANES *A;
     ENTROPY_CONTEXT_PLANES *L;
@@ -57,6 +65,9 @@
     BOOL_DECODER *current_bc;
 
     vp8_prob const *coef_probs[4];
+#if CONFIG_T8X8
+    vp8_prob const *coef_probs_8x8[4];
+#endif
 
     UINT8 eob[25];
 
@@ -78,7 +89,9 @@
     const unsigned char *partitions[MAX_PARTITIONS];
     unsigned int   partition_sizes[MAX_PARTITIONS];
     unsigned int   num_partitions;
-
+#if CONFIG_SEGMENTATION
+    unsigned char *segmentation_map;
+#endif
 #if CONFIG_MULTITHREAD
     /* variable for threading */
 
@@ -87,7 +100,6 @@
     int current_mb_col_main;
     int decoding_thread_count;
     int allocated_decoding_thread_count;
-
     int mt_baseline_filter_level[MAX_MB_SEGMENTS];
     int sync_range;
     int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index b3c2439..64d1c93 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -23,7 +23,9 @@
 #include "vpx_mem/vpx_mem.h"
 #include "bitstream.h"
 #include "vp8/common/defaultcoefcounts.h"
-
+#if CONFIG_SEGMENTATION
+static int segment_cost = 0;
+#endif
 const int vp8cx_base_skip_false_prob[128] =
 {
     255, 255, 255, 255, 255, 255, 255, 255,
@@ -51,11 +53,19 @@
 #ifdef ENTROPY_STATS
 int intra_mode_stats[10][10][10];
 static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];
+#if CONFIG_T8X8
+static unsigned int tree_update_hist_8x8 [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];
+#endif
+
 extern unsigned int active_section;
 #endif
 
 #ifdef MODE_STATS
 int count_mb_seg[4] = { 0, 0, 0, 0 };
+#if CONFIG_SEGMENTATION
+int segment_modes_intra[MAX_MB_SEGMENTS] = { 0, 0, 0, 0 };
+int segment_modes_inter[MAX_MB_SEGMENTS] = { 0, 0, 0, 0 };
+#endif
 #endif
 
 
@@ -812,24 +822,39 @@
         case 0:
             vp8_write(w, 0, x->mb_segment_tree_probs[0]);
             vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+#if CONFIG_SEGMENTATION
+            segment_cost += vp8_cost_zero(x->mb_segment_tree_probs[0]) + vp8_cost_zero(x->mb_segment_tree_probs[1]);
+#endif
             break;
         case 1:
             vp8_write(w, 0, x->mb_segment_tree_probs[0]);
             vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+#if CONFIG_SEGMENTATION
+            segment_cost += vp8_cost_zero(x->mb_segment_tree_probs[0]) + vp8_cost_one(x->mb_segment_tree_probs[1]);
+#endif
             break;
         case 2:
             vp8_write(w, 1, x->mb_segment_tree_probs[0]);
             vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+#if CONFIG_SEGMENTATION
+            segment_cost += vp8_cost_one(x->mb_segment_tree_probs[0]) + vp8_cost_zero(x->mb_segment_tree_probs[2]);
+#endif
             break;
         case 3:
             vp8_write(w, 1, x->mb_segment_tree_probs[0]);
             vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+#if CONFIG_SEGMENTATION
+            segment_cost += vp8_cost_one(x->mb_segment_tree_probs[0]) + vp8_cost_one(x->mb_segment_tree_probs[2]);
+#endif
             break;
 
             // TRAP.. This should not happen
         default:
             vp8_write(w, 0, x->mb_segment_tree_probs[0]);
             vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+#if CONFIG_SEGMENTATION
+            segment_cost += vp8_cost_zero(x->mb_segment_tree_probs[0]) + vp8_cost_zero(x->mb_segment_tree_probs[1]);
+#endif
             break;
         }
     }
@@ -841,7 +866,13 @@
     VP8_COMMON *const pc = & cpi->common;
     vp8_writer *const w = & cpi->bc;
     const MV_CONTEXT *mvc = pc->fc.mvc;
-
+    MACROBLOCKD *xd = &cpi->mb.e_mbd;
+#if CONFIG_SEGMENTATION
+    int left_id, above_id;
+    int i;
+    int sum;
+    int index = 0;
+#endif
     const int *const rfct = cpi->count_mb_ref_frame_usage;
     const int rf_intra = rfct[INTRA_FRAME];
     const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
@@ -898,7 +929,9 @@
     update_mbintra_mode_probs(cpi);
 
     vp8_write_mvprobs(cpi);
-
+#if CONFIG_SEGMENTATION
+    vp8_write_bit(w, (xd->temporal_update) ? 1:0);
+#endif
     while (++mb_row < pc->mb_rows)
     {
         int mb_col = -1;
@@ -909,7 +942,7 @@
             const MV_REFERENCE_FRAME rf = mi->ref_frame;
             const MB_PREDICTION_MODE mode = mi->mode;
 
-            MACROBLOCKD *xd = &cpi->mb.e_mbd;
+            //MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
             // Distance of Mb to the various image edges.
             // These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
@@ -917,13 +950,53 @@
             xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
             xd->mb_to_top_edge = -((mb_row * 16)) << 3;
             xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
+#if CONFIG_SEGMENTATION
+            xd->up_available = (mb_row != 0);
+            xd->left_available = (mb_col != 0);
+#endif
 #ifdef ENTROPY_STATS
             active_section = 9;
 #endif
 
+#ifdef MODE_STATS
+#if CONFIG_SEGMENTATION
+            segment_modes_inter[mi->segment_id]++;
+#endif
+#endif
             if (cpi->mb.e_mbd.update_mb_segmentation_map)
+            {
+#if CONFIG_SEGMENTATION
+                if (xd->temporal_update)
+                {
+                    sum = 0;
+                    if (mb_col != 0)
+                        sum +=  (m-1)->mbmi.segment_flag;
+                    if (mb_row != 0)
+                        sum += (m-pc->mb_cols)->mbmi.segment_flag;
+
+                    if (m->mbmi.segment_flag == 0)
+                    {
+                        vp8_write(w,0,xd->mb_segment_tree_probs[3+sum]);
+                        segment_cost += vp8_cost_zero(xd->mb_segment_tree_probs[3+sum]);
+                    }
+                    else
+                    {
+                        vp8_write(w,1,xd->mb_segment_tree_probs[3+sum]);
+                        segment_cost += vp8_cost_one(xd->mb_segment_tree_probs[3+sum]);
+                        write_mb_features(w, mi, &cpi->mb.e_mbd);
+                        cpi->segmentation_map[index] = mi->segment_id;
+                    }
+                }
+                else
+                {
+                    write_mb_features(w, mi, &cpi->mb.e_mbd);
+                    cpi->segmentation_map[index] = mi->segment_id;
+                }
+                index++;
+#else
                 write_mb_features(w, mi, &cpi->mb.e_mbd);
+#endif
+            }
 
             if (pc->mb_no_coeff_skip)
                 vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
@@ -1058,7 +1131,11 @@
     const VP8_COMMON *const c = & cpi->common;
     /* const */
     MODE_INFO *m = c->mi;
-
+#if CONFIG_SEGMENTATION
+    int left_id, above_id;
+    int i;
+    int index = 0;
+#endif
     int mb_row = -1;
     int prob_skip_false = 0;
 
@@ -1083,9 +1160,28 @@
         while (++mb_col < c->mb_cols)
         {
             const int ym = m->mbmi.mode;
+#if CONFIG_SEGMENTATION
+            MACROBLOCKD *xd = &cpi->mb.e_mbd;
+            xd->up_available = (mb_row != 0);
+            xd->left_available = (mb_col != 0);
+#endif
+#ifdef MODE_STATS
+#if CONFIG_SEGMENTATION
+            segment_modes_intra[m->mbmi.segment_id]++;
+#endif
+#endif
 
             if (cpi->mb.e_mbd.update_mb_segmentation_map)
+            {
+#if CONFIG_SEGMENTATION
+
                 write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+                cpi->segmentation_map[index] = m->mbmi.segment_id;
+                index++;
+#else
+                write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+#endif
+            }
 
             if (c->mb_no_coeff_skip)
                 vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false);
@@ -1314,6 +1410,7 @@
 int vp8_estimate_entropy_savings(VP8_COMP *cpi)
 {
     int savings = 0;
+    int i=0;
 
     const int *const rfct = cpi->count_mb_ref_frame_usage;
     const int rf_intra = rfct[INTRA_FRAME];
@@ -1378,6 +1475,65 @@
         savings += default_coef_context_savings(cpi);
 
 
+#if CONFIG_T8X8
+    i = 0;
+    do
+    {
+        int j = 0;
+
+        do
+        {
+            int k = 0;
+
+            do
+            {
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                //vp8_prob new_p           [ENTROPY_NODES];
+                //unsigned int branch_ct   [ENTROPY_NODES] [2];
+
+                int t = 0;      /* token/prob index */
+
+                vp8_tree_probs_from_distribution(
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    cpi->frame_coef_probs_8x8 [i][j][k], cpi->frame_branch_ct_8x8 [i][j][k], cpi->coef_counts_8x8 [i][j][k],
+                    256, 1
+                );
+
+                do
+                {
+                    const unsigned int *ct  = cpi->frame_branch_ct_8x8 [i][j][k][t];
+                    const vp8_prob newp = cpi->frame_coef_probs_8x8 [i][j][k][t];
+
+                    const vp8_prob old = cpi->common.fc.coef_probs_8x8 [i][j][k][t];
+                    const vp8_prob upd = vp8_coef_update_probs_8x8 [i][j][k][t];
+
+                    const int old_b = vp8_cost_branch(ct, old);
+                    const int new_b = vp8_cost_branch(ct, newp);
+
+                    const int update_b = 8 +
+                                         ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+                    const int s = old_b - new_b - update_b;
+
+                    if (s > 0)
+                        savings += s;
+
+
+                }
+                while (++t < MAX_ENTROPY_TOKENS - 1);
+
+
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+#endif
+
+
     return savings;
 }
 
@@ -1504,6 +1660,92 @@
     }
     while (++i < BLOCK_TYPES);
 
+#if CONFIG_T8X8
+    i = 0;
+    do
+    {
+        int j = 0;
+
+        do
+        {
+            int k = 0;
+
+            do
+            {
+                //note: use result from vp8_estimate_entropy_savings, so no need to call vp8_tree_probs_from_distribution here.
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                //vp8_prob new_p           [ENTROPY_NODES];
+                //unsigned int branch_ct   [ENTROPY_NODES] [2];
+
+                int t = 0;      /* token/prob index */
+
+                //vp8_tree_probs_from_distribution(
+                //    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                //    new_p, branch_ct, (unsigned int *)cpi->coef_counts [i][j][k],
+                //    256, 1
+                //    );
+
+                do
+                {
+                    const unsigned int *ct  = cpi->frame_branch_ct_8x8 [i][j][k][t];
+                    const vp8_prob newp = cpi->frame_coef_probs_8x8 [i][j][k][t];
+
+                    vp8_prob *Pold = cpi->common.fc.coef_probs_8x8 [i][j][k] + t;
+                    const vp8_prob old = *Pold;
+                    const vp8_prob upd = vp8_coef_update_probs_8x8 [i][j][k][t];
+
+                    const int old_b = vp8_cost_branch(ct, old);
+                    const int new_b = vp8_cost_branch(ct, newp);
+
+                    const int update_b = 8 +
+                                         ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+                    const int s = old_b - new_b - update_b;
+                    const int u = s > 0 ? 1 : 0;
+
+                    vp8_write(w, u, upd);
+
+
+#ifdef ENTROPY_STATS
+                    ++ tree_update_hist_8x8 [i][j][k][t] [u];
+#endif
+
+                    if (u)
+                    {
+                        /* send/use new probability */
+
+                        *Pold = newp;
+                        vp8_write_literal(w, newp, 8);
+
+                        savings += s;
+
+                    }
+
+                }
+                while (++t < MAX_ENTROPY_TOKENS - 1);
+
+                /* Accum token counts for generation of default statistics */
+#ifdef ENTROPY_STATS
+                t = 0;
+
+                do
+                {
+                    context_counters_8x8 [i][j][k][t] += cpi->coef_counts_8x8 [i][j][k][t];
+                }
+                while (++t < MAX_ENTROPY_TOKENS);
+
+#endif
+
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+#endif
+
 }
 #ifdef PACKET_TESTING
 FILE *vpxlogc = 0;
@@ -1584,8 +1826,9 @@
     }
     else
         vp8_start_encode(bc, cx_data);
-
-
+#if CONFIG_SEGMENTATION
+    xd->update_mb_segmentation_map = 1;
+#endif
     // Signal whether or not Segmentation is enabled
     vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0);
 
@@ -1635,8 +1878,12 @@
 
         if (xd->update_mb_segmentation_map)
         {
+ #if CONFIG_SEGMENTATION
             // Write the probs used to decode the segment id for each macro block.
+            for (i = 0; i < MB_FEATURE_TREE_PROBS+3; i++)
+#else
             for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+#endif
             {
                 int Data = xd->mb_segment_tree_probs[i];
 
@@ -1908,6 +2155,46 @@
     }
 
     fprintf(f, "};\n");
+
+#if CONFIG_T8X8
+    fprintf(f, "const vp8_prob tree_update_probs_8x8[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n");
+
+    for (i = 0; i < BLOCK_TYPES; i++)
+    {
+        fprintf(f, "  { \n");
+
+        for (j = 0; j < COEF_BANDS; j++)
+        {
+            fprintf(f, "    {\n");
+
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+            {
+                fprintf(f, "      {");
+
+                for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++)
+                {
+                    Sum = tree_update_hist_8x8[i][j][k][l][0] + tree_update_hist_8x8[i][j][k][l][1];
+
+                    if (Sum > 0)
+                    {
+                        if (((tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum) > 0)
+                            fprintf(f, "%3ld, ", (tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum);
+                        else
+                            fprintf(f, "%3ld, ", 1);
+                    }
+                    else
+                        fprintf(f, "%3ld, ", 128);
+                }
+
+                fprintf(f, "},\n");
+            }
+
+            fprintf(f, "    },\n");
+        }
+
+        fprintf(f, "  },\n");
+    }
+#endif
     fclose(f);
 }
 #endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 0d14b54..8a95db7 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -46,7 +46,7 @@
     int src;
     int src_stride;
 
-//  MV  enc_mv;
+    //  MV  enc_mv;
     int force_empty;
 
 } BLOCK;
@@ -126,6 +126,12 @@
     void (*short_walsh4x4)(short *input, short *output, int pitch);
     void (*quantize_b)(BLOCK *b, BLOCKD *d);
     void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+ #if CONFIG_T8X8
+    void (*vp8_short_fdct8x8)(short *input, short *output, int pitch);
+    void (*short_fhaar2x2)(short *input, short *output, int pitch);
+    void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
+    void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
+#endif
 
 } MACROBLOCK;
 
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 69a882c..fd4c62c 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -11,6 +11,122 @@
 
 #include <math.h>
 #include "vpx_ports/config.h"
+
+
+
+
+
+#if CONFIG_T8X8
+void vp8_short_fdct8x8_c(short *block, short *coefs, int pitch) /* 8x8 forward DCT done in single-precision float: reads 8 rows of 8 shorts from block, writes 64 rounded shorts to coefs (row-major, stride 8) */
+{
+  int j1, i, j, k;
+  float b[8];      /* working values for the row/column being transformed */
+  float b1[8];     /* butterfly scratch */
+  float d[8][8];   /* row-pass output; overwritten in place by the column pass */
+  float f0 = (float) .7071068; /* cos(pi/4) */
+  float f1 = (float) .4903926; /* cos(1*pi/16) / 2 */
+  float f2 = (float) .4619398; /* cos(2*pi/16) / 2 */
+  float f3 = (float) .4157348; /* cos(3*pi/16) / 2 */
+  float f4 = (float) .3535534; /* cos(4*pi/16) / 2 */
+  float f5 = (float) .2777851; /* cos(5*pi/16) / 2 */
+  float f6 = (float) .1913417; /* cos(6*pi/16) / 2 */
+  float f7 = (float) .0975452; /* cos(7*pi/16) / 2 */
+  pitch = pitch / 2; /* the row step below is half the value passed in; presumably callers pass the stride in bytes -- TODO confirm */
+  for (i = 0, k = 0; i < 8; i++, k += pitch)
+  {
+    for (j = 0; j < 8; j++)
+    {
+      b[j] = (float)( block[k + j]<<1); /* input is doubled before the transform. NOTE(review): << on a negative value is undefined behaviour in C; multiplying by 2 would be the portable spelling */
+    }
+    /* Horizontal transform: 1-D 8-point butterfly on row i, result stored in d[i][0..7] */
+    for (j = 0; j < 4; j++)
+    {
+      j1 = 7 - j;
+      b1[j] = b[j] + b[j1];  /* sums of mirrored samples (used for the even-index outputs) */
+      b1[j1] = b[j] - b[j1]; /* differences of mirrored samples (used for the odd-index outputs) */
+    }
+    b[0] = b1[0] + b1[3];
+    b[1] = b1[1] + b1[2];
+    b[2] = b1[1] - b1[2];
+    b[3] = b1[0] - b1[3];
+    b[4] = b1[4];
+    b[5] = (b1[6] - b1[5]) * f0;
+    b[6] = (b1[6] + b1[5]) * f0;
+    b[7] = b1[7];
+    d[i][0] = (b[0] + b[1]) * f4;
+    d[i][4] = (b[0] - b[1]) * f4;
+    d[i][2] = b[2] * f6 + b[3] * f2;
+    d[i][6] = b[3] * f6 - b[2] * f2;
+    b1[4] = b[4] + b[5];
+    b1[7] = b[7] + b[6];
+    b1[5] = b[4] - b[5];
+    b1[6] = b[7] - b[6];
+    d[i][1] = b1[4] * f7 + b1[7] * f1;
+    d[i][5] = b1[5] * f3 + b1[6] * f5;
+    d[i][7] = b1[7] * f7 - b1[4] * f1;
+    d[i][3] = b1[6] * f3 - b1[5] * f5;
+  }
+  /* Vertical transform: the same butterfly applied down each column i of d, in place */
+  for (i = 0; i < 8; i++)
+  {
+    for (j = 0; j < 4; j++)
+    {
+      j1 = 7 - j;
+      b1[j] = d[j][i] + d[j1][i];
+      b1[j1] = d[j][i] - d[j1][i];
+    }
+    b[0] = b1[0] + b1[3];
+    b[1] = b1[1] + b1[2];
+    b[2] = b1[1] - b1[2];
+    b[3] = b1[0] - b1[3];
+    b[4] = b1[4];
+    b[5] = (b1[6] - b1[5]) * f0;
+    b[6] = (b1[6] + b1[5]) * f0;
+    b[7] = b1[7];
+    d[0][i] = (b[0] + b[1]) * f4;
+    d[4][i] = (b[0] - b[1]) * f4;
+    d[2][i] = b[2] * f6 + b[3] * f2;
+    d[6][i] = b[3] * f6 - b[2] * f2;
+    b1[4] = b[4] + b[5];
+    b1[7] = b[7] + b[6];
+    b1[5] = b[4] - b[5];
+    b1[6] = b[7] - b[6];
+    d[1][i] = b1[4] * f7 + b1[7] * f1;
+    d[5][i] = b1[5] * f3 + b1[6] * f5;
+    d[7][i] = b1[7] * f7 - b1[4] * f1;
+    d[3][i] = b1[6] * f3 - b1[5] * f5;
+  }
+  for (i = 0; i < 8; i++)
+  {
+    for (j = 0; j < 8; j++)
+    {
+      *(coefs + j + i * 8) = (short) floor(d[i][j] +0.5); /* round to nearest (floor of x + 0.5), then narrow to short */
+    }
+  }
+  return;
+}
+
+
+
+void vp8_short_fhaar2x2_c(short *input, short *output, int pitch) /* 2x2 transform over four values of input; pitch is not read by this function (original note: pitch = 8) */
+{
+    /* [1 1 ; 1 -1] orthogonal transform */
+    /* use position: 0,1, 4, 8 -- only these four indices of input are read and only these four of output end up non-zero */
+   int i;
+   short *ip1 = input;
+   short *op1 = output;
+   for (i = 0; i < 16; i++) /* clear all 16 output entries first */
+   {
+       op1[i] = 0;
+   }
+   /* sum/difference combinations of the four inputs; no scaling or rounding is applied */
+   op1[0]=ip1[0] + ip1[1] + ip1[4] + ip1[8];
+   op1[1]=ip1[0] - ip1[1] + ip1[4] - ip1[8];
+   op1[4]=ip1[0] + ip1[1] - ip1[4] - ip1[8];
+   op1[8]=ip1[0] - ip1[1] - ip1[4] + ip1[8];
+
+}
+#endif
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
 {
     int i;
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index fec3b4c..c37d47a 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -22,6 +22,20 @@
 #include "arm/dct_arm.h"
 #endif
 
+#if CONFIG_T8X8
+
+#ifndef vp8_fdct_short8x8
+#define vp8_fdct_short8x8  vp8_short_fdct8x8_c
+#endif
+extern prototype_fdct(vp8_fdct_short8x8);
+
+#ifndef vp8_fhaar_short2x2
+#define vp8_fhaar_short2x2  vp8_short_fhaar2x2_c
+#endif
+extern prototype_fdct(vp8_fhaar_short2x2);
+
+#endif
+
 #ifndef vp8_fdct_short4x4
 #define vp8_fdct_short4x4  vp8_short_fdct4x4_c
 #endif
@@ -49,6 +63,10 @@
 typedef prototype_fdct(*vp8_fdct_fn_t);
 typedef struct
 {
+#if CONFIG_T8X8
+    vp8_fdct_fn_t    short8x8;
+    vp8_fdct_fn_t    haar_short2x2;
+#endif
     vp8_fdct_fn_t    short4x4;
     vp8_fdct_fn_t    short8x4;
     vp8_fdct_fn_t    fast4x4;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 3cc96c1..1c4a936 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -26,10 +26,12 @@
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/reconintra.h"
 #include <stdio.h>
+#include <math.h>
 #include <limits.h>
 #include "vp8/common/subpixel.h"
 #include "vpx_ports/vpx_timer.h"
 
+
 #if CONFIG_RUNTIME_CPU_DETECT
 #define RTCD(x)     &cpi->common.rtcd.x
 #define IF_RTCD(x)  (x)
@@ -37,6 +39,18 @@
 #define RTCD(x)     NULL
 #define IF_RTCD(x)  NULL
 #endif
+
+#if CONFIG_SEGMENTATION
+#define SEEK_SEGID 12
+#define SEEK_SAMEID 4
+#define SEEK_DIFFID 7
+#endif
+
+#ifdef ENC_DEBUG
+int enc_debug=0;
+int mb_row_debug, mb_col_debug;
+#endif
+
 extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
 
 extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
@@ -52,6 +66,8 @@
 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
 static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
 
+
+
 #ifdef MODE_STATS
 unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 unsigned int inter_uv_modes[4] = {0, 0, 0, 0};
@@ -80,6 +96,186 @@
 };
 
 
+
+#if CONFIG_T8X8
+
+//INTRA mode transform size
+//When all three criteria are off the default is 4x4
+//#define INTRA_VARIANCE_ENTROPY_CRITERIA
+#define INTRA_WTD_SSE_ENTROPY_CRITERIA
+//#define INTRA_TEST_8X8_ONLY
+//
+//INTER mode transform size
+//When all three criteria are off the default is 4x4
+//#define INTER_VARIANCE_ENTROPY_CRITERIA
+#define INTER_WTD_SSE_ENTROPY_CRITERIA
+//#define INTER_TEST_8X8_ONLY
+
+double variance_Block(short *b1, int pitch, int dimension) /* population variance (divides by N, not N-1) of a dimension x dimension block of shorts starting at b1 */
+{
+    short ip[8][8]={{0}}; /* local copy of the block; fixed 8x8 storage, so dimension must not exceed 8 */
+    short *b = b1;
+    int i, j = 0;
+    double mean = 0.0, variance = 0.0;
+    for (i = 0; i < dimension; i++) /* pass 1: copy the block and accumulate the sum */
+    {
+        for (j = 0; j < dimension; j++)
+        {
+            ip[i][j] = b[j];
+            mean += ip[i][j];
+        }
+        b += pitch; /* pitch is the row stride in shorts */
+    }
+    mean /= (dimension*dimension);
+    /* pass 2: accumulate squared deviations from the mean */
+    for (i = 0; i < dimension; i++)
+    {
+        for (j = 0; j < dimension; j++)
+        {
+            variance += (ip[i][j]-mean)*(ip[i][j]-mean);
+        }
+    }
+    variance /= (dimension*dimension);
+    return variance;
+}
+
+double mean_Block(short *b, int pitch, int dimension) /* arithmetic mean of a dimension x dimension block of shorts starting at b */
+{
+    short ip[8][8]={{0}}; /* local copy of the block; fixed 8x8 storage, so dimension must not exceed 8 */
+    int i, j = 0;
+    double mean = 0;
+    for (i = 0; i < dimension; i++)
+    {
+        for (j = 0; j < dimension; j++)
+        {
+            ip[i][j] = b[j];
+            mean += ip[i][j];
+        }
+        b += pitch; /* pitch is the row stride in shorts */
+    }
+    mean /= (dimension*dimension);
+
+    return mean;
+}
+
+int SSE_Block(short *b, int pitch, int dimension) /* sum of squared values over a dimension x dimension block of shorts starting at b */
+{
+    int i, j, sse_block = 0;
+    for (i = 0; i < dimension; i++)
+    {
+        for (j = 0; j < dimension; j++)
+        {
+            sse_block += b[j]*b[j]; /* accumulated in int */
+        }
+        b += pitch; /* pitch is the row stride in shorts */
+    }
+   return sse_block;
+}
+
+double Compute_Variance_Entropy(MACROBLOCK *x) /* entropy, in bits (0..2), of how the src_diff variance is distributed over four 8x8 blocks of x; returns 0 when the total variance is 0 */
+{
+    double variance_8[4] = {0.0, 0.0, 0.0, 0.0}, sum_var = 0.0, all_entropy = 0.0;
+    variance_8[0] = variance_Block(x->block[0].src_diff, 16, 8); /* blocks 0, 2, 8 and 10 with stride 16: presumably the four 8x8 quadrants of the 16x16 luma residual -- TODO confirm layout */
+    variance_8[1] = variance_Block(x->block[2].src_diff, 16, 8);
+    variance_8[2] = variance_Block(x->block[8].src_diff, 16, 8);
+    variance_8[3] = variance_Block(x->block[10].src_diff, 16, 8);
+    sum_var = variance_8[0] + variance_8[1] + variance_8[2] + variance_8[3];
+    if(sum_var)
+    {
+      int i;
+      for(i = 0; i <4; i++)
+      {
+        if(variance_8[i]) /* zero-variance blocks are skipped, so log(0) is never evaluated */
+        {
+          variance_8[i] /= sum_var; /* turn the variance into its share of the total */
+          all_entropy -= variance_8[i]*log(variance_8[i]);
+        }
+      }
+    }
+    return (all_entropy /log(2)); /* convert from nats to bits */
+}
+
+double Compute_Wtd_SSE_SubEntropy(MACROBLOCK *x) /* variance-weighted average, over four 8x8 blocks of x, of the entropy (in bits) of each block's SSE split across its four 4x4 sub-blocks; 0 when all variances are 0 */
+{
+    double variance_8[4] = {0.0, 0.0, 0.0, 0.0}; /* per-8x8 weight */
+    double entropy_8[4] = {0.0, 0.0, 0.0, 0.0};  /* per-8x8 sub-block entropy */
+    double sse_1, sse_2, sse_3, sse_4, sse_0;    /* sse_0: whole 8x8 SSE; sse_1..4: each 4x4's share of it */
+    int i;
+    for (i=0;i<3;i+=2) /* 8x8 blocks starting at block 0 and block 2 -> slots 0 and 2 */
+    {
+      sse_0 = SSE_Block(x->block[i].src_diff, 16, 8);
+      if(sse_0) /* an all-zero 8x8 contributes nothing and must not be divided by */
+      {
+        sse_1 = SSE_Block(x->block[i].src_diff, 16, 4)/sse_0;
+        sse_2 = SSE_Block(x->block[i+1].src_diff, 16, 4)/sse_0;
+        sse_3 = SSE_Block(x->block[i+4].src_diff, 16, 4)/sse_0;
+        sse_4 = SSE_Block(x->block[i+5].src_diff, 16, 4)/sse_0;
+        variance_8[i]= variance_Block(x->block[i].src_diff, 16, 8);
+        if(sse_1 && sse_2 && sse_3 && sse_4) /* entropy is left at 0 if any 4x4 share is 0 */
+        entropy_8[i]= (-sse_1*log(sse_1)
+                       -sse_2*log(sse_2)
+                       -sse_3*log(sse_3)
+                       -sse_4*log(sse_4))/log(2);
+      }
+    }
+    for (i=8;i<11;i+=2) /* 8x8 blocks starting at block 8 and block 10 -> slots 1 and 3 (i-7) */
+    {
+      sse_0 = SSE_Block(x->block[i].src_diff, 16, 8); /* must be computed before the guard: testing the previous block's value could skip this block or divide by zero */
+      if(sse_0)
+      {
+        sse_1 = SSE_Block(x->block[i].src_diff, 16, 4)/sse_0;
+        sse_2 = SSE_Block(x->block[i+1].src_diff, 16, 4)/sse_0;
+        sse_3 = SSE_Block(x->block[i+4].src_diff, 16, 4)/sse_0;
+        sse_4 = SSE_Block(x->block[i+5].src_diff, 16, 4)/sse_0;
+        variance_8[i-7]= variance_Block(x->block[i].src_diff, 16, 8);
+        if(sse_1 && sse_2 && sse_3 && sse_4)
+        entropy_8[i-7]= (-sse_1*log(sse_1)
+                         -sse_2*log(sse_2)
+                         -sse_3*log(sse_3)
+                         -sse_4*log(sse_4))/log(2);
+      }
+    }
+    if(variance_8[0]+variance_8[1]+variance_8[2]+variance_8[3]) /* weighted mean of the entropies, weights = variances */
+      return (entropy_8[0]*variance_8[0]+
+              entropy_8[1]*variance_8[1]+
+              entropy_8[2]*variance_8[2]+
+              entropy_8[3]*variance_8[3])/
+             (variance_8[0]+
+              variance_8[1]+
+              variance_8[2]+
+              variance_8[3]);
+    else
+      return 0;
+}
+
+int vp8_8x8_selection_intra(MACROBLOCK *x) /* returns non-zero to select the 8x8 transform for an intra macroblock, 0 to keep 4x4; the test used is chosen at compile time by the INTRA_* macros */
+{
+#ifdef INTRA_VARIANCE_ENTROPY_CRITERIA
+    return (Compute_Variance_Entropy(x) > 1.2); /* threshold in bits */
+#elif defined(INTRA_WTD_SSE_ENTROPY_CRITERIA)
+    return (Compute_Wtd_SSE_SubEntropy(x) > 1.2); /* threshold in bits */
+#elif defined(INTRA_TEST_8X8_ONLY)
+    return 1; /* always 8x8 */
+#else
+    return 0; //when all criteria are off use the default 4x4 only
+#endif
+}
+
+int vp8_8x8_selection_inter(MACROBLOCK *x) /* returns non-zero to select the 8x8 transform for an inter macroblock, 0 to keep 4x4; the test used is chosen at compile time by the INTER_* macros */
+{
+#ifdef INTER_VARIANCE_ENTROPY_CRITERIA
+    return (Compute_Variance_Entropy(x) > 1.5); /* threshold in bits; higher than the intra threshold of 1.2 */
+#elif defined(INTER_WTD_SSE_ENTROPY_CRITERIA)
+    return (Compute_Wtd_SSE_SubEntropy(x) > 1.5); /* threshold in bits; higher than the intra threshold of 1.2 */
+#elif defined(INTER_TEST_8X8_ONLY)
+    return 1; /* always 8x8 */
+#else
+    return 0; //when all criteria are off use the default 4x4 only
+#endif
+}
+
+#endif
+
 // Original activity measure from Tim T's code.
 static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
 {
@@ -376,7 +572,10 @@
     int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
     int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
     int map_index = (mb_row * cpi->common.mb_cols);
-
+#if CONFIG_SEGMENTATION
+    int left_id, above_id;
+    int sum;
+#endif
 #if CONFIG_MULTITHREAD
     const int nsync = cpi->mt_sync_range;
     const int rightmost_col = cm->mb_cols - 1;
@@ -415,6 +614,12 @@
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
     {
+#ifdef ENC_DEBUG
+        //enc_debug = (cpi->count==29 && mb_row==5 && mb_col==0);
+        enc_debug = (cpi->count==4 && mb_row==17 && mb_col==13);
+        mb_col_debug=mb_col;
+        mb_row_debug=mb_row;
+#endif
         // Distance of Mb to the left & right edges, specified in
         // 1/8th pel units as they are always compared to values
         // that are in 1/8th pel units
@@ -461,8 +666,14 @@
         if (xd->segmentation_enabled)
         {
             // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+#if CONFIG_T8X8
+            // Reset segment_id to 0 or 1 so that the default transform mode is 4x4
+            if (cpi->segmentation_map[map_index+mb_col] <= 3)
+                xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index+mb_col]&1;
+#else
             if (cpi->segmentation_map[map_index+mb_col] <= 3)
                 xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index+mb_col];
+#endif
             else
                 xd->mode_info_context->mbmi.segment_id = 0;
 
@@ -476,24 +687,27 @@
         if (cm->frame_type == KEY_FRAME)
         {
             *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+            //Note the encoder may have changed the segment_id
+
 #ifdef MODE_STATS
-            y_modes[xd->mbmi.mode] ++;
+            y_modes[xd->mode_info_context->mbmi.mode] ++;
 #endif
         }
         else
         {
             *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+            //Note the encoder may have changed the segment_id
 
 #ifdef MODE_STATS
-            inter_y_modes[xd->mbmi.mode] ++;
+            inter_y_modes[xd->mode_info_context->mbmi.mode] ++;
 
-            if (xd->mbmi.mode == SPLITMV)
+            if (xd->mode_info_context->mbmi.mode == SPLITMV)
             {
                 int b;
 
-                for (b = 0; b < xd->mbmi.partition_count; b++)
+                for (b = 0; b < x->partition_info->count; b++)
                 {
-                    inter_b_modes[x->partition->bmi[b].mode] ++;
+                    inter_b_modes[x->partition_info->bmi[b].mode] ++;
                 }
             }
 
@@ -534,6 +748,12 @@
         // Increment the activity mask pointers.
         x->mb_activity_ptr++;
 
+#if CONFIG_SEGMENTATION
+        if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+            xd->mode_info_context->mbmi.segment_id = 0;
+        else
+            xd->mode_info_context->mbmi.segment_id = 1;
+#endif
         /* save the block info */
         for (i = 0; i < 16; i++)
             xd->mode_info_context->bmi[i] = xd->block[i].bmi;
@@ -546,9 +766,42 @@
         recon_yoffset += 16;
         recon_uvoffset += 8;
 
-        // Keep track of segment useage
-        segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
+#if CONFIG_SEGMENTATION
+       //cpi->segmentation_map[mb_row * cm->mb_cols + mb_col] =  xd->mbmi.segment_id;
+        if (cm->frame_type == KEY_FRAME)
+        {
+            segment_counts[xd->mode_info_context->mbmi.segment_id]++;
+        }
+        else
+        {
+            sum = 0;
+            if (mb_col != 0)
+                sum += (xd->mode_info_context-1)->mbmi.segment_flag;
+            if (mb_row != 0)
+                sum += (xd->mode_info_context-cm->mb_cols)->mbmi.segment_flag;
 
+            if (xd->mode_info_context->mbmi.segment_id == cpi->segmentation_map[(mb_row*cm->mb_cols) + mb_col])
+                xd->mode_info_context->mbmi.segment_flag = 0;
+            else
+                xd->mode_info_context->mbmi.segment_flag = 1;
+
+            if (xd->mode_info_context->mbmi.segment_flag == 0)
+            {
+                segment_counts[SEEK_SAMEID + sum]++;
+                segment_counts[10]++;
+            }
+            else
+            {
+                segment_counts[SEEK_DIFFID + sum]++;
+                segment_counts[11]++;
+                //calculate individual segment ids
+                segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
+            }
+        }
+        segment_counts[SEEK_SEGID + xd->mode_info_context->mbmi.segment_id] ++;
+#else
+        segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
+#endif
         // skip to next mb
         xd->mode_info_context++;
         x->partition_info++;
@@ -675,7 +928,13 @@
     MACROBLOCKD *const xd = & x->e_mbd;
 
     TOKENEXTRA *tp = cpi->tok;
+#if CONFIG_SEGMENTATION
+    int segment_counts[MAX_MB_SEGMENTS + SEEK_SEGID];
+    int prob[3];
+    int new_cost, original_cost;
+#else
     int segment_counts[MAX_MB_SEGMENTS];
+#endif
     int totalrate;
 
     vpx_memset(segment_counts, 0, sizeof(segment_counts));
@@ -736,7 +995,7 @@
 
     vp8cx_frame_init_quantizer(cpi);
 
-    vp8_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+    vp8_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);

     vp8cx_initialize_me_consts(cpi, cm->base_qindex);
 
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
@@ -845,41 +1104,126 @@
 
     }
 
-
     // Work out the segment probabilites if segmentation is enabled
     if (xd->segmentation_enabled)
     {
         int tot_count;
         int i;
+        int count1,count2,count3,count4;
 
         // Set to defaults
         vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+#if CONFIG_SEGMENTATION
 
-        tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+        tot_count = segment_counts[12] + segment_counts[13] + segment_counts[14] + segment_counts[15];
+        count1 = segment_counts[12] + segment_counts[13];
+        count2 = segment_counts[14] + segment_counts[15];
 
         if (tot_count)
+            prob[0] = (count1 * 255) / tot_count;
+
+        if (count1 > 0)
+            prob[1] = (segment_counts[12] * 255) /count1;
+
+        if (count2 > 0)
+            prob[2] = (segment_counts[14] * 255) /count2;
+
+        if (cm->frame_type != KEY_FRAME)
         {
-            xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count;
+            tot_count = segment_counts[4] + segment_counts[7];
+            if (tot_count)
+                xd->mb_segment_tree_probs[3] = (segment_counts[4] * 255)/tot_count;
 
-            tot_count = segment_counts[0] + segment_counts[1];
+            tot_count = segment_counts[5] + segment_counts[8];
+            if (tot_count)
+                xd->mb_segment_tree_probs[4] = (segment_counts[5] * 255)/tot_count;
 
-            if (tot_count > 0)
-            {
-                xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count;
-            }
+            tot_count = segment_counts[6] + segment_counts[9];
+            if (tot_count)
+                xd->mb_segment_tree_probs[5] = (segment_counts[6] * 255)/tot_count;
+        }
 
-            tot_count = segment_counts[2] + segment_counts[3];
+        tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+        count3 = segment_counts[0] + segment_counts[1];
+        count4 = segment_counts[2] + segment_counts[3];
 
-            if (tot_count > 0)
-                xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
+        if (tot_count)
+            xd->mb_segment_tree_probs[0] = (count3 * 255) / tot_count;
 
-            // Zero probabilities not allowed
-            for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
+        if (count3 > 0)
+            xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) /count3;
+
+        if (count4 > 0)
+            xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) /count4;
+
+        for (i = 0; i < MB_FEATURE_TREE_PROBS+3; i++)
+        {
+            if (xd->mb_segment_tree_probs[i] == 0)
+                xd->mb_segment_tree_probs[i] = 1;
+        }
+
+        original_cost = count1 * vp8_cost_zero(prob[0]) + count2 * vp8_cost_one(prob[0]);
+
+        if (count1 > 0)
+            original_cost += segment_counts[12] * vp8_cost_zero(prob[1]) + segment_counts[13] * vp8_cost_one(prob[1]);
+
+        if (count2 > 0)
+            original_cost += segment_counts[14] * vp8_cost_zero(prob[2]) + segment_counts[15] * vp8_cost_one(prob[2]) ;
+
+        new_cost = 0;
+
+        if (cm->frame_type != KEY_FRAME)
+        {
+            new_cost = segment_counts[4] * vp8_cost_zero(xd->mb_segment_tree_probs[3]) + segment_counts[7] *  vp8_cost_one(xd->mb_segment_tree_probs[3]);
+
+            new_cost += segment_counts[5] * vp8_cost_zero(xd->mb_segment_tree_probs[4]) + segment_counts[8] * vp8_cost_one(xd->mb_segment_tree_probs[4]);
+
+            new_cost += segment_counts[6] * vp8_cost_zero(xd->mb_segment_tree_probs[5]) + segment_counts[9] * vp8_cost_one (xd->mb_segment_tree_probs[5]);
+        }
+
+        if (tot_count > 0)
+            new_cost += count3 * vp8_cost_zero(xd->mb_segment_tree_probs[0]) + count4 * vp8_cost_one(xd->mb_segment_tree_probs[0]);
+
+        if (count3 > 0)
+            new_cost += segment_counts[0] * vp8_cost_zero(xd->mb_segment_tree_probs[1]) + segment_counts[1] * vp8_cost_one(xd->mb_segment_tree_probs[1]);
+
+        if (count4 > 0)
+            new_cost += segment_counts[2] * vp8_cost_zero(xd->mb_segment_tree_probs[2]) + segment_counts[3] * vp8_cost_one(xd->mb_segment_tree_probs[2]) ;
+
+        if (new_cost < original_cost)
+            xd->temporal_update = 1;
+        else
+        {
+            xd->temporal_update = 0;
+            xd->mb_segment_tree_probs[0] = prob[0];
+            xd->mb_segment_tree_probs[1] = prob[1];
+            xd->mb_segment_tree_probs[2] = prob[2];
+        }
+#else
+        tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+        count1 = segment_counts[0] + segment_counts[1];
+        count2 = segment_counts[2] + segment_counts[3];
+
+        if (tot_count)
+            xd->mb_segment_tree_probs[0] = (count1 * 255) / tot_count;
+
+        if (count1 > 0)
+            xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) /count1;
+
+        if (count2 > 0)
+            xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) /count2;
+
+#endif
+        // Zero probabilities not allowed
+#if CONFIG_SEGMENTATION
+            for (i = 0; i < MB_FEATURE_TREE_PROBS+3; i++)
+#else
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+#endif
             {
                 if (xd->mb_segment_tree_probs[i] == 0)
                     xd->mb_segment_tree_probs[i] = 1;
             }
-        }
     }
 
     // 256 rate units to the bit
@@ -1081,7 +1425,7 @@
 
         do
         {
-            ++ bct[xd->block[b].bmi.mode];
+            ++ bct[xd->block[b].bmi.as_mode];
         }
         while (++b < 16);
     }
@@ -1119,6 +1463,10 @@
 {
     int rate;
 
+#if CONFIG_T8X8
+    if (x->e_mbd.segmentation_enabled)
+        x->e_mbd.update_mb_segmentation_map = 1;
+#endif
     if (cpi->sf.RD && cpi->compressor_speed != 2)
         vp8_rd_pick_intra_mode(cpi, x, &rate);
     else
@@ -1133,12 +1481,22 @@
     if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
         vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
     else
+    {
+#if CONFIG_T8X8
+        if (x->e_mbd.segmentation_enabled)
+            x->e_mbd.mode_info_context->mbmi.segment_id |= (vp8_8x8_selection_intra(x) << 1);
+#endif
         vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-
+    }
     vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
     sum_intra_stats(cpi, x);
     vp8_tokenize_mb(cpi, &x->e_mbd, t);
-
+#if CONFIG_T8X8
+        if( x->e_mbd.mode_info_context->mbmi.segment_id >=2)
+            cpi->t8x8_count++;
+        else
+            cpi->t4x4_count++;
+#endif
     return rate;
 }
 #ifdef SPEEDSTATS
@@ -1260,16 +1618,25 @@
 
     cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
 
+#if CONFIG_T8X8
+    if (xd->segmentation_enabled)
+        x->e_mbd.update_mb_segmentation_map = 1;
+#endif
+
     if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
-        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-
         if (xd->mode_info_context->mbmi.mode == B_PRED)
         {
+            vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
             vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
         }
         else
         {
+#if CONFIG_T8X8
+            if (xd->segmentation_enabled)
+              xd->mode_info_context->mbmi.segment_id |= (vp8_8x8_selection_intra(x) << 1);
+#endif
+            vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
             vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
         }
 
@@ -1278,6 +1645,10 @@
     else
     {
         int ref_fb_idx;
+#if CONFIG_T8X8
+        if (xd->segmentation_enabled)
+          xd->mode_info_context->mbmi.segment_id |= (vp8_8x8_selection_inter(x) << 1);
+#endif
 
         vp8_build_uvmvs(xd, cpi->common.full_pixel);
 
@@ -1307,9 +1678,40 @@
                                            xd->dst.y_stride, xd->dst.uv_stride);
 
     }
+#if CONFIG_T8X8
+    if (x->e_mbd.mode_info_context->mbmi.segment_id >=2)
+        cpi->t8x8_count++;
+    else
+        cpi->t4x4_count++;
+#endif
 
     if (!x->skip)
+    {
+#ifdef ENC_DEBUG
+        if (enc_debug)
+        {
+          int i;
+            printf("Segment=%d [%d, %d]: %d %d:\n", x->e_mbd.mode_info_context->mbmi.segment_id, mb_col_debug, mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
+            for (i =0; i<400; i++) {
+              printf("%3d ", xd->qcoeff[i]);
+              if (i%16 == 15) printf("\n");
+            }
+            printf("\n");
+            printf("eobs = ");
+            for (i=0;i<25;i++)
+              printf("%d:%d ", i, xd->block[i].eob);
+            printf("\n");
+            fflush(stdout);
+        }
+#endif
         vp8_tokenize_mb(cpi, xd, t);
+#ifdef ENC_DEBUG
+        if (enc_debug) {
+          printf("Tokenized\n");
+          fflush(stdout);
+        }
+#endif
+    }
     else
     {
         if (cpi->common.mb_no_coeff_skip)
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 59db025..10afed3 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -22,6 +22,10 @@
 #include "encodeintra.h"
 
 
+#ifdef ENC_DEBUG
+extern int enc_debug;
+#endif
+
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x) (x)
 #else
@@ -96,15 +100,67 @@
     RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
-
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_transform_intra_mby_8x8(x);
+    else
+#endif
     vp8_transform_intra_mby(x);
 
-    vp8_quantize_mby(x);
+#if  CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+      vp8_quantize_mby_8x8(x);
+    else
+#endif
+      vp8_quantize_mby(x);
 
     if (x->optimize)
+    {
+#if CONFIG_T8X8
+      if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_optimize_mby_8x8(x, rtcd);
+      else
+#endif
         vp8_optimize_mby(x, rtcd);
+    }
 
-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+      vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    else
+#endif
+      vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      int i;
+      printf("Intra qcoeff:\n");
+      printf("%d %d:\n", x->e_mbd.mb_to_left_edge, x->e_mbd.mb_to_top_edge);
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.qcoeff[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("Intra dqcoeff:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.dqcoeff[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("Intra diff:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.diff[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("Intra predictor:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.predictor[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("eobs:\n");
+      for (i=0;i<25;i++)
+        printf("%d ", x->e_mbd.block[i].eob);
+      printf("\n");
+    }
+#endif
 
     RECON_INVOKE(&rtcd->common->recon, recon_mby)
         (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
@@ -116,14 +172,66 @@
     RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_transform_mbuv_8x8(x);
+    else
+#endif
+        vp8_transform_mbuv(x);
 
-    vp8_transform_mbuv(x);
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_quantize_mbuv_8x8(x);
+    else
+#endif
+        vp8_quantize_mbuv(x);
 
-    vp8_quantize_mbuv(x);
-
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      int i;
+      printf("vp8_encode_intra16x16mbuv\n");
+      printf("%d %d:\n", x->e_mbd.mb_to_left_edge, x->e_mbd.mb_to_top_edge);
+      printf("qcoeff:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.qcoeff[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("dqcoeff:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.dqcoeff[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("diff:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.diff[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("predictor:\n");
+      for (i =0; i<400; i++) {
+        printf("%3d ", x->e_mbd.predictor[i]);
+        if (i%16 == 15) printf("\n");
+      }
+      printf("eobs:\n");
+      for (i=0;i<25;i++)
+        printf("%d ", x->e_mbd.block[i].eob);
+      printf("\n");
+    }
+#endif
     if (x->optimize)
+    {
+#if CONFIG_T8X8
+      if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_optimize_mbuv_8x8(x, rtcd);
+      else
+#endif
         vp8_optimize_mbuv(x, rtcd);
+    }
 
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+      vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    else
+#endif
     vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
     vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index eb89bba..408a595 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -26,6 +26,11 @@
 #else
 #define IF_RTCD(x) NULL
 #endif
+
+#ifdef ENC_DEBUG
+extern int enc_debug;
+#endif
+
 void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
 {
     unsigned char *src_ptr = (*(be->base_src) + be->src);
@@ -117,7 +122,21 @@
         src_diff_ptr[i] = x->coeff[i * 16];
     }
 }
-
+#if CONFIG_T8X8
+/* Build the second-order input at src_diff[384]: clear 16 entries, then
+ * copy x->coeff[0], [64], [128] and [192] into slots 0, 1, 4 and 8. */
+void vp8_build_dcblock_8x8(MACROBLOCK *x)
+{
+    short *const dc_block = x->src_diff + 384;
+    int slot = 16;
+    while (slot-- > 0)
+        dc_block[slot] = 0;
+    dc_block[0] = x->coeff[0];
+    dc_block[1] = x->coeff[64];
+    dc_block[4] = x->coeff[128];
+    dc_block[8] = x->coeff[192];
+}
+#endif
 void vp8_transform_mbuv(MACROBLOCK *x)
 {
     int i;
@@ -197,10 +216,104 @@
     }
 }
 
+#if CONFIG_T8X8
 
+/* Run the 8x8 forward DCT once for block 16 and once for block 20. */
+void vp8_transform_mbuv_8x8(MACROBLOCK *x)
+{
+    BLOCK *blk = &x->block[16];
+    BLOCK *const stop = &x->block[24];
+
+    /* Step four blocks at a time; the last argument is always 16. */
+    for (; blk < stop; blk += 4)
+        x->vp8_short_fdct8x8(blk->src_diff, blk->coeff, 16);
+}
+
+
+/* Luma-only 8x8 forward transform used on the intra path: four 8x8 DCTs,
+ * then the DC block is built by vp8_build_dcblock_8x8() and passed through
+ * the second-order short_fhaar2x2 transform of block 24.
+ * x - macroblock whose block[] src_diff/coeff buffers are used. */
+void vp8_transform_intra_mby_8x8(MACROBLOCK *x)
+{
+    BLOCK *const blk = x->block;
+    BLOCK *const y2 = &x->block[24];
+
+    /* Sources are blocks 0, 8, 2 and 10, in that order; the transforms
+     * of blocks 2 and 10 are written to the coeff of blocks 4 and 12. */
+    x->vp8_short_fdct8x8(blk[0].src_diff, blk[0].coeff, 32);
+    x->vp8_short_fdct8x8(blk[8].src_diff, blk[8].coeff, 32);
+    x->vp8_short_fdct8x8(blk[2].src_diff, blk[4].coeff, 32);
+    x->vp8_short_fdct8x8(blk[10].src_diff, blk[12].coeff, 32);
+
+    /* Gather the first coefficient of each 8x8 transform (four values). */
+    vp8_build_dcblock_8x8(x);
+
+    /* Second-order transform on the gathered DC block. */
+    x->short_fhaar2x2(y2->src_diff,
+                      y2->coeff, 8);
+}
+
+
+/* Full-macroblock 8x8 forward transform: four luma 8x8 DCTs, two chroma
+ * 8x8 DCTs (blocks 16 and 20) and, unless the mode is B_PRED or SPLITMV,
+ * the DC gather plus second-order short_fhaar2x2 transform on block 24.
+ * x - macroblock whose block[] src_diff/coeff buffers are used. */
+void vp8_transform_mb_8x8(MACROBLOCK *x)
+{
+    BLOCK *const blk = x->block;
+    const int mode = x->e_mbd.mode_info_context->mbmi.mode;
+    /* The second-order stage is skipped for B_PRED and SPLITMV. */
+    const int has_y2 = !(mode == B_PRED || mode == SPLITMV);
+    int c;
+
+    /* Luma: sources 0, 8, 2, 10; sources 2 and 10 write to blocks 4 and 12. */
+    x->vp8_short_fdct8x8(blk[0].src_diff, blk[0].coeff, 32);
+    x->vp8_short_fdct8x8(blk[8].src_diff, blk[8].coeff, 32);
+    x->vp8_short_fdct8x8(blk[2].src_diff, blk[4].coeff, 32);
+    x->vp8_short_fdct8x8(blk[10].src_diff, blk[12].coeff, 32);
+
+    /* Gather the luma DC terms before the chroma transforms run. */
+    if (has_y2)
+        vp8_build_dcblock_8x8(x);
+
+    /* Chroma: one 8x8 DCT starting at block 16 and one at block 20. */
+    for (c = 16; c < 24; c += 4)
+        x->vp8_short_fdct8x8(blk[c].src_diff, blk[c].coeff, 16);
+
+    /* Second-order transform on the gathered DC block. */
+    if (has_y2)
+        x->short_fhaar2x2(blk[24].src_diff, blk[24].coeff, 8);
+}
+
+/* Luma-only 8x8 forward transform: four 8x8 DCTs and, for every mode
+ * except SPLITMV, the DC gather followed by the second-order
+ * short_fhaar2x2 transform on block 24.
+ * x - macroblock whose block[] src_diff/coeff buffers are used. */
+void vp8_transform_mby_8x8(MACROBLOCK *x)
+{
+    BLOCK *const blk = x->block;
+
+    /* Sources 0, 8, 2, 10; sources 2 and 10 write to blocks 4 and 12. */
+    x->vp8_short_fdct8x8(blk[0].src_diff, blk[0].coeff, 32);
+    x->vp8_short_fdct8x8(blk[8].src_diff, blk[8].coeff, 32);
+    x->vp8_short_fdct8x8(blk[2].src_diff, blk[4].coeff, 32);
+    x->vp8_short_fdct8x8(blk[10].src_diff, blk[12].coeff, 32);
+
+    /* SPLITMV skips the second-order stage entirely. */
+    if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
+        return;
+
+    /* Gather the four DC terms, then transform the DC block. */
+    vp8_build_dcblock_8x8(x);
+    x->short_fhaar2x2(blk[24].src_diff,
+                      blk[24].coeff, 8);
+}
+
+#endif
 
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-
+#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp8_token_state vp8_token_state;
 
 struct vp8_token_state{
@@ -581,27 +694,554 @@
     }
 }
 
+#if CONFIG_T8X8
+/* Trellis (Viterbi) search over alternative roundings of the quantized
+ * coefficients of 8x8 block i of mb.  Rewrites d->qcoeff, d->dqcoeff and
+ * d->eob along the cheapest path and stores (d->eob != !type) through a
+ * and l.  a1/l1 only contribute to the starting entropy context; rtcd is
+ * unused.  best_mask is 64 bits wide because a decision bit is recorded
+ * for each of up to 64 zig-zag positions (shift counts of up to 63). */
+void optimize_b_8x8(MACROBLOCK *mb, int i, int type,
+                    ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                    ENTROPY_CONTEXT *a1, ENTROPY_CONTEXT *l1,
+                    const VP8_ENCODER_RTCD *rtcd)
+{
+    BLOCK *b;
+    BLOCKD *d;
+    vp8_token_state tokens[65][2];
+    unsigned long long best_mask[2];    /* one decision bit per position */
+    const short *dequant_ptr;
+    const short *coeff_ptr;
+    short *qcoeff_ptr;
+    short *dqcoeff_ptr;
+    int eob;
+    int i0;
+    int rc;
+    int x;
+    int sz = 0;
+    int next;
+    int rdmult;
+    int rddiv;
+    int final_eob;
+    int rd_cost0;
+    int rd_cost1;
+    int rate0;
+    int rate1;
+    int error0;
+    int error1;
+    int t0;
+    int t1;
+    int best;
+    int band;
+    int pt;
+
+    b = &mb->block[i];
+    d = &mb->e_mbd.block[i];
+    /* Enable this to test the effect of RDO as a replacement for the dynamic
+     *  zero bin instead of an augmentation of it.
+     */
+#if 0
+    vp8_strict_quantize_b(b, d);
+#endif
+    dequant_ptr = d->dequant;
+    coeff_ptr = b->coeff;
+    qcoeff_ptr = d->qcoeff;
+    dqcoeff_ptr = d->dqcoeff;
+    i0 = !type;    /* first position visited: 1 when type is 0, else 0 */
+    eob = d->eob;
+    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+    /* TODO: These should vary with the block type, since the quantizer does. */
+    rdmult = mb->rdmult << 2;
+    rddiv = mb->rddiv;
+    best_mask[0] = best_mask[1] = 0;
+    /* Initialize the sentinel node of the trellis. */
+    tokens[eob][0].rate = 0;
+    tokens[eob][0].error = 0;
+    tokens[eob][0].next = 64;
+    tokens[eob][0].token = DCT_EOB_TOKEN;
+    tokens[eob][0].qc = 0;
+    *(tokens[eob] + 1) = *(tokens[eob] + 0);
+    next = eob;
+    for (i = eob; i-- > i0;)
+    {
+        int base_bits;
+        int d2;
+        int dx;
+
+        rc = vp8_default_zig_zag1d_8x8[i];
+        x = qcoeff_ptr[rc];
+        /* Only add a trellis state for non-zero coefficients. */
+        if (x)
+        {
+            int shortcut=0;
+            error0 = tokens[next][0].error;
+            error1 = tokens[next][1].error;
+            /* Evaluate the first possibility for this state. */
+            rate0 = tokens[next][0].rate;
+            rate1 = tokens[next][1].rate;
+            t0 = (vp8_dct_value_tokens_ptr + x)->Token;
+            /* Consider both possible successor states. */
+            if (next < 64)
+            {
+                band = vp8_coef_bands_8x8[i + 1];
+                pt = vp8_prev_token_class[t0];
+                rate0 +=
+                    mb->token_costs[type][band][pt][tokens[next][0].token];
+                rate1 +=
+                    mb->token_costs[type][band][pt][tokens[next][1].token];
+            }
+            rd_cost0 = RDCOST_8x8(rdmult, rddiv, rate0, error0);
+            rd_cost1 = RDCOST_8x8(rdmult, rddiv, rate1, error1);
+            if (rd_cost0 == rd_cost1)
+            {
+                rd_cost0 = RDTRUNC_8x8(rdmult, rddiv, rate0, error0);
+                rd_cost1 = RDTRUNC_8x8(rdmult, rddiv, rate1, error1);
+            }
+            /* And pick the best. */
+            best = rd_cost1 < rd_cost0;
+            base_bits = *(vp8_dct_value_cost_ptr + x);
+            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+            d2 = dx*dx;
+            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+            tokens[i][0].error = d2 + (best ? error1 : error0);
+            tokens[i][0].next = next;
+            tokens[i][0].token = t0;
+            tokens[i][0].qc = x;
+            best_mask[0] |= (unsigned long long)best << i;
+            /* Evaluate the second possibility for this state. */
+            rate0 = tokens[next][0].rate;
+            rate1 = tokens[next][1].rate;
+            /* Try |x| - 1 only if |x| * dequant overshoots |coeff| by less than one step. */
+            if((abs(x)*dequant_ptr[rc!=0]>abs(coeff_ptr[rc])) &&
+               (abs(x)*dequant_ptr[rc!=0]<abs(coeff_ptr[rc])+dequant_ptr[rc!=0]))
+                shortcut = 1;
+            else
+                shortcut = 0;
+            if(shortcut)
+            {
+                sz = -(x < 0);
+                x -= 2*sz + 1;
+            }
+
+            /* Consider both possible successor states. */
+            if (!x)
+            {
+                /* If we reduced this coefficient to zero, check to see if
+                 *  we need to move the EOB back here.
+                 */
+                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+                    DCT_EOB_TOKEN : ZERO_TOKEN;
+                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+                    DCT_EOB_TOKEN : ZERO_TOKEN;
+            }
+            else
+            {
+                t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
+            }
+            if (next < 64)
+            {
+                band = vp8_coef_bands_8x8[i + 1];
+                if(t0!=DCT_EOB_TOKEN)
+                {
+                    pt = vp8_prev_token_class[t0];
+                    rate0 += mb->token_costs[type][band][pt][
+                        tokens[next][0].token];
+                }
+                if(t1!=DCT_EOB_TOKEN)
+                {
+                    pt = vp8_prev_token_class[t1];
+                    rate1 += mb->token_costs[type][band][pt][
+                        tokens[next][1].token];
+                }
+            }
+
+            rd_cost0 = RDCOST_8x8(rdmult, rddiv, rate0, error0);
+            rd_cost1 = RDCOST_8x8(rdmult, rddiv, rate1, error1);
+            if (rd_cost0 == rd_cost1)
+            {
+                rd_cost0 = RDTRUNC_8x8(rdmult, rddiv, rate0, error0);
+                rd_cost1 = RDTRUNC_8x8(rdmult, rddiv, rate1, error1);
+            }
+            /* And pick the best. */
+            best = rd_cost1 < rd_cost0;
+            base_bits = *(vp8_dct_value_cost_ptr + x);
+
+            if(shortcut)
+            {
+                dx -= (dequant_ptr[rc!=0] + sz) ^ sz;
+                d2 = dx*dx;
+            }
+            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+            tokens[i][1].error = d2 + (best ? error1 : error0);
+            tokens[i][1].next = next;
+            tokens[i][1].token =best?t1:t0;
+            tokens[i][1].qc = x;
+            best_mask[1] |= (unsigned long long)best << i;
+            /* Finally, make this the new head of the trellis. */
+            next = i;
+        }
+        /* There's no choice to make for a zero coefficient, so we don't
+         *  add a new trellis node, but we do need to update the costs.
+         */
+        else
+        {
+            band = vp8_coef_bands_8x8[i + 1];
+            t0 = tokens[next][0].token;
+            t1 = tokens[next][1].token;
+            /* Update the cost of each path if we're past the EOB token. */
+            if (t0 != DCT_EOB_TOKEN)
+            {
+                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
+                tokens[next][0].token = ZERO_TOKEN;
+            }
+            if (t1 != DCT_EOB_TOKEN)
+            {
+                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
+                tokens[next][1].token = ZERO_TOKEN;
+            }
+            /* Don't update next, because we didn't add a new node. */
+        }
+    }
+
+    /* Now pick the best path through the whole trellis. */
+    band = vp8_coef_bands_8x8[i + 1];
+    VP8_COMBINEENTROPYCONTEXTS_8x8(pt, *a, *l, *a1, *l1);
+    rate0 = tokens[next][0].rate;
+    rate1 = tokens[next][1].rate;
+    error0 = tokens[next][0].error;
+    error1 = tokens[next][1].error;
+    t0 = tokens[next][0].token;
+    t1 = tokens[next][1].token;
+    rate0 += mb->token_costs[type][band][pt][t0];
+    rate1 += mb->token_costs[type][band][pt][t1];
+    rd_cost0 = RDCOST_8x8(rdmult, rddiv, rate0, error0);
+    rd_cost1 = RDCOST_8x8(rdmult, rddiv, rate1, error1);
+    if (rd_cost0 == rd_cost1)
+    {
+        rd_cost0 = RDTRUNC_8x8(rdmult, rddiv, rate0, error0);
+        rd_cost1 = RDTRUNC_8x8(rdmult, rddiv, rate1, error1);
+    }
+    best = rd_cost1 < rd_cost0;
+    final_eob = i0 - 1;
+    for (i = next; i < eob; i = next)
+    {
+        x = tokens[i][best].qc;
+        if (x)
+            final_eob = i;
+        rc = vp8_default_zig_zag1d_8x8[i];
+        qcoeff_ptr[rc] = x;
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];
+        next = tokens[i][best].next;
+        best = (int)((best_mask[best] >> i) & 1);
+    }
+    final_eob++;
+    d->eob = final_eob;
+    *a = *l = (d->eob != !type);
+}
+
+/* Trellis-optimize every 8x8 transform of a macroblock: luma calls for
+ * b = 0, 4, 8 and 12, then chroma calls for b = 16 and b = 20.  Contexts
+ * come from local copies of the above/left planes. */
+void optimize_mb_8x8(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    int b;
+    int type;
+    int has_2nd_order;
+    ENTROPY_CONTEXT_PLANES t_above, t_left;
+    ENTROPY_CONTEXT *ta;
+    ENTROPY_CONTEXT *tl;
+    /* NOTE(review): unlike vp8_optimize_mby_8x8(), no NULL check precedes these copies. */
+    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+
+    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+    type = has_2nd_order ? 0 : 3;
+    /* Luma pass: b takes the values 0, 4, 8 and 12. */
+    for (b = 0; b < 16; b+=4)
+    {
+        optimize_b_8x8(x, b, type,
+            ta + vp8_block2above[b], tl + vp8_block2left[b],
+            ta + vp8_block2above[b+1], tl + vp8_block2left[b+4],
+            rtcd);
+        /* Copy the context entry written for block b into the entries listed below. */
+        if(b==0)
+        {
+          *(ta + vp8_block2above[1]) = *(ta + vp8_block2above[4]) = *(ta + vp8_block2above[5]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[1]) = *(tl + vp8_block2left[4]) = *(tl + vp8_block2left[5]) = *(tl + vp8_block2left[b]);
+        }
+        else if(b==4)
+        {
+          *(ta + vp8_block2above[2]) = *(ta + vp8_block2above[3]) = *(ta + vp8_block2above[6]) = *(ta + vp8_block2above[7]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[2]) = *(tl + vp8_block2left[3]) = *(tl + vp8_block2left[6]) = *(tl + vp8_block2left[7]) = *(tl + vp8_block2left[b]);
+          *(ta + vp8_block2above[4]) = *(ta + vp8_block2above[1]);
+          *(tl + vp8_block2left[4]) = *(tl + vp8_block2left[1]);
+        }
+        else if(b==8)
+        {
+          *(ta + vp8_block2above[9]) = *(ta + vp8_block2above[12]) = *(ta + vp8_block2above[13]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[9]) = *(tl + vp8_block2left[12]) = *(tl + vp8_block2left[13]) = *(tl + vp8_block2left[b]);
+
+        }
+        else if(b==12)
+        {
+          *(ta + vp8_block2above[10]) = *(ta + vp8_block2above[11]) = *(ta + vp8_block2above[14]) = *(ta + vp8_block2above[15]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[10]) = *(tl + vp8_block2left[11]) = *(tl + vp8_block2left[14]) = *(tl + vp8_block2left[15]) = *(tl + vp8_block2left[b]);
+          *(ta + vp8_block2above[12]) = *(ta + vp8_block2above[8]);
+          *(tl + vp8_block2left[12]) = *(tl + vp8_block2left[8]);
+
+        }
+    }
+    /* First chroma call: the loop body runs once, for b == 16. */
+    for (b = 16; b < 20; b+=4)
+    {
+        optimize_b_8x8(x, b, PLANE_TYPE_UV, //vp8_block2type[b],
+            ta + vp8_block2above[b], tl + vp8_block2left[b],
+            ta + vp8_block2above[b+1], tl + vp8_block2left[b+2],
+            rtcd);
+        *(ta + vp8_block2above[b+1]) = *(ta + vp8_block2above[b+2]) = *(ta + vp8_block2above[b+3]) =
+            *(ta + vp8_block2above[b]);
+        *(tl + vp8_block2left[b+1]) = *(tl + vp8_block2left[b+2]) = *(tl + vp8_block2left[b+3]) =
+            *(tl + vp8_block2left[b]);
+
+    }
+    /* Second chroma call: the loop body runs once, for b == 20. */
+    for (b = 20; b < 24; b+=4)
+    {
+        optimize_b_8x8(x, b, PLANE_TYPE_UV, //vp8_block2type[b],
+            ta + vp8_block2above[b], tl + vp8_block2left[b],
+            ta + vp8_block2above[b+1], tl + vp8_block2left[b+2],
+            rtcd);
+        *(ta + vp8_block2above[b+1]) = *(ta + vp8_block2above[b+2]) = *(ta + vp8_block2above[b+3]) =
+            *(ta + vp8_block2above[b]);
+        *(tl + vp8_block2left[b+1]) = *(tl + vp8_block2left[b+2]) = *(tl + vp8_block2left[b+3]) =
+            *(tl + vp8_block2left[b]);
+
+    }
+
+    /* The second-order (block 24) pass below is disabled. */
+    /*
+    if (has_2nd_order)
+    {
+        vp8_setup_temp_context(&t, x->e_mbd.above_context[Y2CONTEXT],
+            x->e_mbd.left_context[Y2CONTEXT], 1);
+        optimize_b(x, 24, 1, t.a, t.l, rtcd);
+    }
+    */
+}
+
+/* Trellis-optimize the four luma 8x8 transforms (b = 0, 4, 8, 12) against
+ * local copies of the entropy contexts; returns early on a NULL context. */
+void vp8_optimize_mby_8x8(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    int b;
+    int type;
+    int has_2nd_order;
+
+    ENTROPY_CONTEXT_PLANES t_above, t_left;
+    ENTROPY_CONTEXT *ta;
+    ENTROPY_CONTEXT *tl;
+
+    if (!x->e_mbd.above_context)
+        return;
+
+    if (!x->e_mbd.left_context)
+        return;
+
+    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+
+    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+    type = has_2nd_order ? 0 : 3;
+    /* b takes the values 0, 4, 8 and 12. */
+    for (b = 0; b < 16; b+=4)
+    {
+        optimize_b_8x8(x, b, type,
+        ta + vp8_block2above[b], tl + vp8_block2left[b],
+        ta + vp8_block2above[b+1], tl + vp8_block2left[b+4],
+        rtcd);
+        if(b==0)
+        {
+          *(ta + vp8_block2above[1]) = *(ta + vp8_block2above[4]) = *(ta + vp8_block2above[5]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[1]) = *(tl + vp8_block2left[4]) = *(tl + vp8_block2left[5]) = *(tl + vp8_block2left[b]);
+        }
+        else if(b==4)
+        {
+          *(ta + vp8_block2above[2]) = *(ta + vp8_block2above[3]) = *(ta + vp8_block2above[6]) = *(ta + vp8_block2above[7]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[2]) = *(tl + vp8_block2left[3]) = *(tl + vp8_block2left[6]) = *(tl + vp8_block2left[7]) = *(tl + vp8_block2left[b]);
+          *(ta + vp8_block2above[4]) = *(ta + vp8_block2above[1]);
+          *(tl + vp8_block2left[4]) = *(tl + vp8_block2left[1]);
+        }
+        else if(b==8)
+        {
+          *(ta + vp8_block2above[9]) = *(ta + vp8_block2above[12]) = *(ta + vp8_block2above[13]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[9]) = *(tl + vp8_block2left[12]) = *(tl + vp8_block2left[13]) = *(tl + vp8_block2left[b]);
+
+        }
+        else if(b==12)
+        {
+          *(ta + vp8_block2above[10]) = *(ta + vp8_block2above[11]) = *(ta + vp8_block2above[14]) = *(ta + vp8_block2above[15]) = *(ta + vp8_block2above[b]);
+          *(tl + vp8_block2left[10]) = *(tl + vp8_block2left[11]) = *(tl + vp8_block2left[14]) = *(tl + vp8_block2left[15]) = *(tl + vp8_block2left[b]);
+          *(ta + vp8_block2above[12]) = *(ta + vp8_block2above[8]);
+          *(tl + vp8_block2left[12]) = *(tl + vp8_block2left[8]);
+
+        }
+    }
+    /* The second-order (block 24) pass below is disabled. */
+    /*
+    if (has_2nd_order)
+    {
+        vp8_setup_temp_context(&t, x->e_mbd.above_context[Y2CONTEXT],
+            x->e_mbd.left_context[Y2CONTEXT], 1);
+        optimize_b(x, 24, 1, t.a, t.l, rtcd);
+    }
+    */
+}
+
+/* Trellis-optimize the two 8x8 chroma transforms (block indices 16 and 20).
+ *
+ * Entropy contexts are taken from scratch copies of the above/left context
+ * planes; the copies, not x->e_mbd.above_context / left_context, receive the
+ * updates made here.  Returns immediately if either context pointer is NULL.
+ *
+ * x    - macroblock to optimize.
+ * rtcd - forwarded unchanged to optimize_b_8x8(). */
+void vp8_optimize_mbuv_8x8(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+    ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)&above_copy;
+    ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)&left_copy;
+    int head;
+
+    if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+        return;
+
+    vpx_memcpy(&above_copy, x->e_mbd.above_context,
+               sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&left_copy, x->e_mbd.left_context,
+               sizeof(ENTROPY_CONTEXT_PLANES));
+
+    /* head is 16 on the first pass and 20 on the second. */
+    for (head = 16; head < 24; head += 4)
+    {
+        ENTROPY_CONTEXT *const a0 = ta + vp8_block2above[head];
+        ENTROPY_CONTEXT *const l0 = tl + vp8_block2left[head];
+        int k;
+
+        /* The second context pair is looked up at head + 1 (above) and
+         * head + 2 (left). */
+        optimize_b_8x8(x, head, PLANE_TYPE_UV, a0, l0,
+                       ta + vp8_block2above[head + 1],
+                       tl + vp8_block2left[head + 2],
+                       rtcd);
+
+        /* Copy the context entry of block head into the entries of
+         * blocks head + 1 .. head + 3. */
+        for (k = 3; k >= 1; k--)
+        {
+            ta[vp8_block2above[head + k]] = *a0;
+            tl[vp8_block2left[head + k]] = *l0;
+        }
+    }
+}
+#endif
+
 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
     vp8_build_inter_predictors_mb(&x->e_mbd);
 
     vp8_subtract_mb(rtcd, x);
 
-    transform_mb(x);
+#if  CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+         vp8_transform_mb_8x8(x);
+    else
+#endif
+         transform_mb(x);
 
-    vp8_quantize_mb(x);
+#if  CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_quantize_mb_8x8(x);
+    else
+#endif
+        vp8_quantize_mb(x);
 
     if (x->optimize)
+    {
+#if CONFIG_T8X8
+      if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        optimize_mb_8x8(x, rtcd);
+      else
+#endif
         optimize_mb(x, rtcd);
+    }
 
-    vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+        vp8_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    else
+#endif
+        vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2) {
+#ifdef ENC_DEBUG
+        if (enc_debug)
+        {
+          int i;
+          printf("qcoeff:\n");
+          printf("%d %d:\n", x->e_mbd.mb_to_left_edge, x->e_mbd.mb_to_top_edge);
+          for (i =0; i<400; i++) {
+            printf("%3d ", x->e_mbd.qcoeff[i]);
+            if (i%16 == 15) printf("\n");
+          }
+          printf("dqcoeff:\n");
+          for (i =0; i<400; i++) {
+            printf("%3d ", x->e_mbd.dqcoeff[i]);
+            if (i%16 == 15) printf("\n");
+          }
+          printf("diff:\n");
+          for (i =0; i<400; i++) {
+            printf("%3d ", x->e_mbd.diff[i]);
+            if (i%16 == 15) printf("\n");
+          }
+          printf("predictor:\n");
+          for (i =0; i<400; i++) {
+            printf("%3d ", x->e_mbd.predictor[i]);
+            if (i%16 == 15) printf("\n");
+          }
+          printf("\n");
+        }
+#endif
+    }
 
     RECON_INVOKE(&rtcd->common->recon, recon_mb)
         (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      int i, j, k;
+      printf("Final Reconstruction\n");
+      for (i =0; i<16; i+=4) {
+        BLOCKD *b = &x->e_mbd.block[i];
+        unsigned char *d = *(b->base_dst) + b->dst;
+        for (k=0; k<4; k++) {
+          for (j=0; j<16; j++)
+            printf("%3d ", d[j]);
+          printf("\n");
+          d+=b->dst_stride;
+        }
+      }
+    }
+#endif
 }
 
 
-/* this funciton is used by first pass only */
+/* this function is used by first pass only */
 void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
     BLOCK *b = &x->block[0];
@@ -610,22 +1250,34 @@
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
 
-    transform_mby(x);
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+          vp8_transform_mby_8x8(x);
+    else
+#endif
+          transform_mby(x);
 
     vp8_quantize_mby(x);
-
-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+          vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    else
+#endif
+          vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
     RECON_INVOKE(&rtcd->common->recon, recon_mby)
         (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }
 
-
 void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
     vp8_build_inter_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
+#if CONFIG_T8X8
+    if(x->e_mbd.mode_info_context->mbmi.segment_id >= 2)
+       vp8_transform_mbuv_8x8(x);
+    else
+#endif
     vp8_transform_mbuv(x);
 
     vp8_quantize_mbuv(x);
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 47fc72d..73f1ad2 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -103,4 +103,16 @@
 void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_encode_inter16x16y(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+#if CONFIG_T8X8
+void vp8_transform_mb_8x8(MACROBLOCK *mb);
+void vp8_transform_mbuv_8x8(MACROBLOCK *x);
+void vp8_transform_intra_mby_8x8(MACROBLOCK *x);
+void vp8_build_dcblock_8x8(MACROBLOCK *b);
+void vp8_optimize_mby_8x8(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+void vp8_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+#endif
+
+
+
 #endif
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 1a37f03..6b9dff8 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -24,6 +24,14 @@
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 
+#ifdef MODE_STATS
+extern unsigned int inter_y_modes[10];
+extern unsigned int inter_uv_modes[4];
+extern unsigned int inter_b_modes[15];
+extern unsigned int y_modes[5];
+extern unsigned int uv_modes[4];
+extern unsigned int b_modes[14];
+#endif
 extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
 static THREAD_FUNCTION loopfilter_thread(void *p_data)
@@ -175,7 +183,7 @@
                     {
                         *totalrate += vp8cx_encode_intra_macro_block(cpi, x, &tp);
 #ifdef MODE_STATS
-                        y_modes[xd->mbmi.mode] ++;
+                        y_modes[xd->mode_info_context->mbmi.mode] ++;
 #endif
                     }
                     else
@@ -183,15 +191,15 @@
                         *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset);
 
 #ifdef MODE_STATS
-                        inter_y_modes[xd->mbmi.mode] ++;
+                        inter_y_modes[xd->mode_info_context->mbmi.mode] ++;
 
-                        if (xd->mbmi.mode == SPLITMV)
+                        if (xd->mode_info_context->mbmi.mode == SPLITMV)
                         {
                             int b;
 
-                            for (b = 0; b < xd->mbmi.partition_count; b++)
+                            for (b = 0; b < x->partition_info->count; b++)
                             {
-                                inter_b_modes[x->partition->bmi[b].mode] ++;
+                                inter_b_modes[x->partition_info->bmi[b].mode] ++;
                             }
                         }
 
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index bd1959d..e78514e 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -71,6 +71,10 @@
 
     cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
 
+#if CONFIG_T8X8
+    cpi->rtcd.fdct.short8x8                  = vp8_short_fdct8x8_c;
+    cpi->rtcd.fdct.haar_short2x2             = vp8_short_fhaar2x2_c;
+#endif
     cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
     cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
     cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
@@ -88,6 +92,12 @@
     cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;
     cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
     cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_c;
+#if CONFIG_T8X8
+    cpi->rtcd.quantize.quantb_8x8            = vp8_regular_quantize_b_8x8;
+    cpi->rtcd.quantize.fastquantb_8x8        = vp8_fast_quantize_b_8x8_c;
+    cpi->rtcd.quantize.quantb_2x2            = vp8_regular_quantize_b_2x2;
+    cpi->rtcd.quantize.fastquantb_2x2        = vp8_fast_quantize_b_2x2_c;
+#endif
     cpi->rtcd.search.full_search             = vp8_full_search_sad;
     cpi->rtcd.search.refining_search         = vp8_refining_search_sad;
     cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index ae0278e..f862fd0 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -105,6 +105,9 @@
 #ifdef OUTPUT_YUV_SRC
 FILE *yuv_file;
 #endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
 
 #if 0
 FILE *framepsnr;
@@ -130,15 +133,21 @@
 unsigned int cnt_ef = 0;
 #endif
 
+#if defined(SECTIONBITS_OUTPUT)
+extern unsigned __int64 Sectionbits[500];
+#endif
 #ifdef MODE_STATS
 extern unsigned __int64 Sectionbits[50];
 extern int y_modes[5]  ;
 extern int uv_modes[4] ;
 extern int b_modes[10]  ;
-
 extern int inter_y_modes[10] ;
 extern int inter_uv_modes[4] ;
 extern unsigned int inter_b_modes[15];
+#if CONFIG_SEGMENTATION
+extern int segment_modes_intra[MAX_MB_SEGMENTS];
+extern int segment_modes_inter[MAX_MB_SEGMENTS];
+#endif
 #endif
 
 extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
@@ -309,7 +318,11 @@
 static void setup_features(VP8_COMP *cpi)
 {
     // Set up default state for MB feature flags
+#if CONFIG_SEGMENTATION
+    cpi->mb.e_mbd.segmentation_enabled = 1;
+#else
     cpi->mb.e_mbd.segmentation_enabled = 0;
+#endif
     cpi->mb.e_mbd.update_mb_segmentation_map = 0;
     cpi->mb.e_mbd.update_mb_segmentation_data = 0;
     vpx_memset(cpi->mb.e_mbd.mb_segment_tree_probs, 255, sizeof(cpi->mb.e_mbd.mb_segment_tree_probs));
@@ -408,7 +421,6 @@
 
     // Copy in the new segmentation map
     vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
-
     // Signal that the map should be updated.
     cpi->mb.e_mbd.update_mb_segmentation_map = 1;
     cpi->mb.e_mbd.update_mb_segmentation_data = 1;
@@ -434,12 +446,10 @@
 static void segmentation_test_function(VP8_PTR ptr)
 {
     VP8_COMP *cpi = (VP8_COMP *)(ptr);
-
     unsigned char *seg_map;
     signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
-
+    CHECK_MEM_ERROR(seg_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
     // Create a temporary map for segmentation data.
-    CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
 
     // MB loop to set local segmentation map
     /*for ( i = 0; i < cpi->common.mb_rows; i++ )
@@ -499,7 +509,7 @@
     int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
 
     // Create a temporary map for segmentation data.
-    CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+    CHECK_MEM_ERROR(seg_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
 
     cpi->cyclic_refresh_q = Q;
 
@@ -1238,16 +1248,25 @@
 
     if (cpi->sf.improved_dct)
     {
+#if CONFIG_T8X8
+        cpi->mb.vp8_short_fdct8x8 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x8);
+#endif
         cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
         cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
     }
     else
     {
+#if CONFIG_T8X8
+        cpi->mb.vp8_short_fdct8x8 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x8);
+#endif
         cpi->mb.vp8_short_fdct8x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
         cpi->mb.vp8_short_fdct4x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
     }
 
     cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
+#if CONFIG_T8X8
+    cpi->mb.short_fhaar2x2 = FDCT_INVOKE(&cpi->rtcd.fdct, haar_short2x2);
+#endif
 
     if (cpi->sf.improved_quant)
     {
@@ -1255,6 +1274,10 @@
                                                   quantb);
         cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
                                                   quantb_pair);
+#if CONFIG_T8X8
+        cpi->mb.quantize_b_8x8  = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb_8x8);
+        cpi->mb.quantize_b_2x2  = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb_2x2);
+#endif
     }
     else
     {
@@ -1262,6 +1285,10 @@
                                                   fastquantb);
         cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
                                                   fastquantb_pair);
+#if CONFIG_T8X8
+        cpi->mb.quantize_b_8x8  = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb_8x8);
+        cpi->mb.quantize_b_2x2  = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb_2x2);
+#endif
     }
     if (cpi->sf.improved_quant != last_improved_quant)
         vp8cx_init_quantizer(cpi);
@@ -1913,7 +1940,7 @@
     CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int)));
 
     // Create the encoder segmentation map and set all entries to 0
-    CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+    CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
     CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
     vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
     cpi->active_map_enabled = 0;
@@ -1949,13 +1976,12 @@
     cpi->cyclic_refresh_q = 32;
 
     if (cpi->cyclic_refresh_mode_enabled)
-    {
         CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-    }
     else
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
     // Test function for segmentation
+
     //segmentation_test_function((VP8_PTR) cpi);
 
 #ifdef ENTROPY_STATS
@@ -2046,6 +2072,9 @@
 #ifdef OUTPUT_YUV_SRC
     yuv_file = fopen("bd.yuv", "ab");
 #endif
+#ifdef OUTPUT_YUV_REC
+    yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
 
 #if 0
     framepsnr = fopen("framepsnr.stt", "a");
@@ -2264,8 +2293,8 @@
 #ifdef MODE_STATS
         {
             extern int count_mb_seg[4];
-            FILE *f = fopen("modes.stt", "a");
-            double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+            FILE *f = fopen("modes.stt", "w");
+            double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000 ;
             fprintf(f, "intra_mode in Intra Frames:\n");
             fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
             fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -2279,6 +2308,9 @@
                 fprintf(f, "\n");
 
             }
+#if CONFIG_SEGMENTATION
+            fprintf(f, "Segments:%8d, %8d, %8d, %8d\n", segment_modes_intra[0], segment_modes_intra[1], segment_modes_intra[2], segment_modes_intra[3]);
+#endif
 
             fprintf(f, "Modes in Inter Frames:\n");
             fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n",
@@ -2298,8 +2330,9 @@
             fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
             fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
 
-
-
+#if CONFIG_SEGMENTATION
+            fprintf(f, "Segments:%8d, %8d, %8d, %8d\n", segment_modes_inter[0], segment_modes_inter[1], segment_modes_inter[2], segment_modes_inter[3]);
+#endif
             fclose(f);
         }
 #endif
@@ -2387,6 +2420,9 @@
 #ifdef OUTPUT_YUV_SRC
     fclose(yuv_file);
 #endif
+#ifdef OUTPUT_YUV_REC
+    fclose(yuv_rec_file);
+#endif
 
 #if 0
 
@@ -2597,10 +2633,9 @@
 }
 
 
-#if OUTPUT_YUV_SRC
-void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
+#ifdef OUTPUT_YUV_SRC
+void vp8_write_yuv_frame(YV12_BUFFER_CONFIG *s)
 {
-    FILE *yuv_file = fopen(name, "ab");
     unsigned char *src = s->y_buffer;
     int h = s->y_height;
 
@@ -2630,8 +2665,42 @@
         src += s->uv_stride;
     }
     while (--h);
+}
+#endif
 
-    fclose(yuv_file);
+#ifdef OUTPUT_YUV_REC
+void vp8_write_yuv_rec_frame(VP8_COMMON *cm)  /* debug: append cm->frame_to_show to yuv_rec_file as raw Y, U, V rows */
+{
+    YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+    unsigned char *src = s->y_buffer;
+    int h = cm->Height;
+    if (yuv_rec_file == NULL || h <= 0) return;  /* fopen() result is not checked where the file is opened; do-while needs h >= 1 */
+    do  /* Y plane: one row of y_width bytes per iteration, stepping by y_stride */
+    {
+        fwrite(src, s->y_width, 1,  yuv_rec_file);
+        src += s->y_stride;
+    }
+    while (--h);
+    /* U plane: (Height+1)/2 rows of uv_width bytes, stepping by uv_stride. */
+    src = s->u_buffer;
+    h = (cm->Height+1)/2;
+
+    do
+    {
+        fwrite(src, s->uv_width, 1,  yuv_rec_file);
+        src += s->uv_stride;
+    }
+    while (--h);
+    /* V plane: same row count and row width as the U plane. */
+    src = s->v_buffer;
+    h = (cm->Height+1)/2;
+
+    do
+    {
+        fwrite(src, s->uv_width, 1, yuv_rec_file);
+        src += s->uv_stride;
+    }
+    while (--h);
 }
 #endif
 
@@ -3422,6 +3491,10 @@
 
     // Test code for segmentation of gf/arf (0,0)
     //segmentation_test_function((VP8_PTR) cpi);
+#if CONFIG_SEGMENTATION
+    cpi->mb.e_mbd.segmentation_enabled = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+#endif
 
 #if CONFIG_REALTIME_ONLY
     if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
@@ -4653,14 +4726,8 @@
         fclose(recon_file);
     }
 #endif
-#if 0
-    // DEBUG
-    if(cm->current_video_frame>173 && cm->current_video_frame<178)
-    {
-        char filename[512];
-        sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-        vp8_write_yuv_frame(filename, cm->frame_to_show);
-    }
+#ifdef OUTPUT_YUV_REC
+    vp8_write_yuv_rec_frame(cm);
 #endif
 
 }
@@ -5015,7 +5082,7 @@
     }
     else
 #endif
-        encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 
     if (cpi->compressor_speed == 2)
     {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 0d347e3..460da21 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -193,7 +193,11 @@
 typedef struct
 {
     MACROBLOCK  mb;
+#if CONFIG_SEGMENTATION
+    int segment_counts[MAX_MB_SEGMENTS + 8];
+#else
     int segment_counts[MAX_MB_SEGMENTS];
+#endif
     int totalrate;
 } MB_ROW_COMP;
 
@@ -407,6 +411,11 @@
     //save vp8_tree_probs_from_distribution result for each frame to avoid repeat calculation
     vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
     unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+#if CONFIG_T8X8
+    unsigned int coef_counts_8x8 [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+    vp8_prob frame_coef_probs_8x8 [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+    unsigned int frame_branch_ct_8x8 [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+#endif
 
     int gfu_boost;
     int kf_boost;
@@ -465,6 +474,10 @@
     int gf_update_recommended;
     int skip_true_count;
     int skip_false_count;
+#if CONFIG_T8X8

+    int t4x4_count;

+    int t8x8_count;

+#endif
 
     unsigned char *segmentation_map;
     signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            // Segment data (can be deltas or absolute values)
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 503d241..328eabb 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -16,6 +16,10 @@
 #include "quantize.h"
 #include "vp8/common/quant_common.h"
 
+#ifdef ENC_DEBUG
+extern int enc_debug;
+#endif
+
 #define EXACT_QUANT
 
 #ifdef EXACT_FASTQUANT
@@ -77,7 +81,11 @@
     short *qcoeff_ptr  = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = d->dequant;
+#if CONFIG_T8X8
 
+    vpx_memset(qcoeff_ptr, 0, 32);
+    vpx_memset(dqcoeff_ptr, 0, 32);
+#endif
     eob = -1;
     for (i = 0; i < 16; i++)
     {
@@ -267,7 +275,8 @@
     d->eob = eob + 1;
 }
 
-#endif
+#endif //EXACT_QUANT
+
 
 void vp8_quantize_mby_c(MACROBLOCK *x)
 {
@@ -301,6 +310,592 @@
         x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
 }
 
+#if CONFIG_T8X8
+
+#ifdef EXACT_FASTQUANT
+void vp8_fast_quantize_b_2x2_c(BLOCK *b, BLOCKD *d)
+{
+  // Quantizes the coefficients at the first four zig-zag positions of b->coeff
+  // into d->qcoeff / d->dqcoeff; d->eob becomes one past the last nonzero one.
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;  // byte-sized shifts, as the EXACT_QUANT quantizers in this file read them
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  //double q2nd = 4;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 32);                    // clear 32 bytes of each output buffer
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;                                         // -1 means no nonzero coefficient yet
+
+  for (i = 0; i < 4; i++)
+  {
+    rc   = vp8_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+    //zbin = zbin_ptr[rc]/q2nd ;
+    zbin = zbin_ptr[rc] ;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin)
+    {
+      //x += (round_ptr[rc]/q2nd);
+      x += (round_ptr[rc]);
+      //y  = ((int)((int)(x * quant_ptr[rc] * q2nd) >> 16) + x)
+      //    >> quant_shift_ptr[rc];                // quantize (x)
+      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
+          >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc] = x;                          // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+      }
+    }
+  }
+  d->eob = eob + 1;
+}
+
+void vp8_fast_quantize_b_8x8_c(BLOCK *b, BLOCKD *d)// 8x8 quantizer: table entry 0 for the DC position, entry 1 shared by every AC position
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;  // byte-sized shifts, as the EXACT_QUANT quantizers in this file read them
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  //double q1st = 2;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 64*sizeof(short));      // clear all 64 output coefficients
+  vpx_memset(dqcoeff_ptr, 0, 64*sizeof(short));
+
+  eob = -1;                                         // -1 means no nonzero coefficient yet
+
+  for (i = 0; i < 64; i++)
+  {
+    rc   = vp8_default_zig_zag1d_8x8[i];
+    z    = coeff_ptr[rc];
+    //zbin = zbin_ptr[rc!=0]/q1st ;
+    zbin = zbin_ptr[rc!=0] ;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin)
+    {
+      //x += round_ptr[rc]/q1st;
+      //y  = ((int)(((int)((x * quant_ptr[rc!=0] * q1st)) >> 16) + x))
+      //    >> quant_shift_ptr[rc!=0];                // quantize (x)
+      x += round_ptr[rc!=0];                      // rc runs to 63 here; use the DC/AC entry like every other table
+      y  = ((int)(((int)((x * quant_ptr[rc!=0])) >> 16) + x))
+          >> quant_shift_ptr[rc!=0];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc] = x;                          // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0] / q1st;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+      }
+    }
+  }
+  d->eob = eob + 1;
+}
+
+#else
+
+void vp8_fast_quantize_b_2x2_c(BLOCK *b, BLOCKD *d)  // fast 2x2 quantizer (build without EXACT_FASTQUANT)
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  //double q2nd = 4;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 32);    // clear 32 bytes of each output buffer
+  vpx_memset(dqcoeff_ptr, 0, 32);
+  // eob holds the zig-zag index of the last nonzero quantized value; -1 means none.
+  eob = -1;
+  // Only the first four zig-zag positions are visited.
+  for (i = 0; i < 4; i++)
+  {
+    rc   = vp8_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+    //zbin = zbin_ptr[rc]/q2nd;
+    zbin = zbin_ptr[rc];
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+    // Magnitudes below the zero bin stay at the 0 written by the memset above.
+    if (x >= zbin)
+    {
+      //y  = ((int)((x + round_ptr[rc]/q2nd) * quant_ptr[rc] * q2nd)) >> 16; // quantize (x)
+      y  = ((int)((x + round_ptr[rc]) * quant_ptr[rc])) >> 16; // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc] = x;                          // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc] / q2nd;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+      }
+    }
+  }
+  d->eob = eob + 1;                 // one past the last nonzero position, 0 if all zero
+  //if (d->eob > 4) printf("Flag Fast 2 (%d)\n", d->eob);
+}
+
+void vp8_fast_quantize_b_8x8_c(BLOCK *b, BLOCKD *d)  // fast 8x8 quantizer (build without EXACT_FASTQUANT)
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  //double q1st = 2;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 64*sizeof(short));    // clear all 64 output coefficients
+  vpx_memset(dqcoeff_ptr, 0, 64*sizeof(short));
+  // eob holds the zig-zag index of the last nonzero quantized value; -1 means none.
+  eob = -1;
+
+  for (i = 0; i < 64; i++)
+  {
+    // Tables are indexed with rc!=0: entry 0 for position 0, entry 1 for all others.
+    rc   = vp8_default_zig_zag1d_8x8[i];
+    z    = coeff_ptr[rc];
+    //zbin = zbin_ptr[rc!=0]/q1st ;
+    zbin = zbin_ptr[rc!=0] ;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+    // Magnitudes below the zero bin stay at the 0 written by the memset above.
+    if (x >= zbin)
+    {
+      //y  = ((int)((x + round_ptr[rc!=0] / q1st) * quant_ptr[rc!=0] * q1st)) >> 16;
+      y  = ((int)((x + round_ptr[rc!=0]) * quant_ptr[rc!=0])) >> 16;
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc] = x;                         // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0] / q1st;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];        // dequantized value
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+      }
+    }
+  }
+  d->eob = eob + 1;                 // one past the last nonzero position, 0 if all zero
+}
+
+#endif //EXACT_FASTQUANT
+
+#ifdef EXACT_QUANT
+void vp8_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d)  // regular 2x2 quantizer with zero-run zbin boost (EXACT_QUANT build)
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+  //double q2nd = 4;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 32);    // clear 32 bytes of each output buffer
+  vpx_memset(dqcoeff_ptr, 0, 32);
+  // eob holds the zig-zag index of the last nonzero quantized value; -1 means none.
+  eob = -1;
+  // Only the first four zig-zag positions are visited.
+  for (i = 0; i < 4; i++)
+  {
+    rc   = vp8_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+    // Zero bin = per-position zbin + boost for the current zero run + b->zbin_extra.
+    //zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value)/q2nd;
+    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
+    // Step to the next boost entry; it is rewound below after a nonzero result.
+    zbin_boost_ptr ++;
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin)
+    {
+      //x += (round_ptr[rc]/q2nd);
+      x += (round_ptr[rc]);
+      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                         // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc]/q2nd;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;                 // one past the last nonzero position, 0 if all zero
+}
+
+void vp8_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d)  // regular 8x8 quantizer with zero-run zbin boost (EXACT_QUANT build)
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+  //double q1st = 2;   // disabled experimental scale; only the commented-out lines below use it
+  // Clear all 64 output coefficients; values below the zero bin are never written again.
+  vpx_memset(qcoeff_ptr, 0, 64*sizeof(short));
+  vpx_memset(dqcoeff_ptr, 0, 64*sizeof(short));
+  // eob holds the zig-zag index of the last nonzero quantized value; -1 means none.
+  eob = -1;
+
+  for (i = 0; i < 64; i++)
+  {
+    // Tables are indexed with rc!=0: entry 0 for position 0, entry 1 for all others.
+    rc   = vp8_default_zig_zag1d_8x8[i];
+    z    = coeff_ptr[rc];
+    // Zero bin = DC/AC zbin + boost for the current zero run + b->zbin_extra.
+    //zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value)/q1st;
+    zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
+    // NOTE(review): advances once per coefficient and only rewinds after a nonzero result, so a zero run can step it up to 64 entries - confirm zrun_zbin_boost is that long.
+    zbin_boost_ptr ++;
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin)
+    {
+      //x += (round_ptr[rc!=0]/q1st);
+      //y  = ((int)(((int)(x * quant_ptr[rc!=0] * q1st) >> 16) + x))
+      //    >> quant_shift_ptr[rc!=0];                // quantize (x)
+      x += (round_ptr[rc!=0]);
+      y  = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
+          >> quant_shift_ptr[rc!=0];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                         // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0] / q1st;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;                 // one past the last nonzero position, 0 if all zero
+}
+
+void vp8_strict_quantize_b_2x2(BLOCK *b, BLOCKD *d)  // 2x2 quantizer that rounds by half the dequant step and uses no zbin table
+{
+  int i;
+  int rc;
+  int eob;
+  int x;
+  int y;
+  int z;
+  int sz;
+  short *coeff_ptr;
+  short *quant_ptr;
+  unsigned char *quant_shift_ptr;
+  short *qcoeff_ptr;
+  short *dqcoeff_ptr;
+  short *dequant_ptr;
+  //double q2nd = 4;   // disabled experimental scale; only the commented-out lines below use it
+  coeff_ptr = b->coeff;
+  quant_ptr = b->quant;
+  quant_shift_ptr = b->quant_shift;
+  qcoeff_ptr = d->qcoeff;
+  dqcoeff_ptr = d->dqcoeff;
+  dequant_ptr = d->dequant;
+  eob = - 1;                        /* -1 means no nonzero coefficient yet */
+  vpx_memset(qcoeff_ptr, 0, 32);    /* clear 32 bytes of each output buffer */
+  vpx_memset(dqcoeff_ptr, 0, 32);
+  for (i = 0; i < 4; i++)
+  {
+    int dq;
+    int round;
+
+    /*TODO: These arrays should be stored in zig-zag order.*/
+    rc = vp8_default_zig_zag1d[i];
+    z = coeff_ptr[rc];
+    //z = z * q2nd;
+    //dq = dequant_ptr[rc]/q2nd;
+    dq = dequant_ptr[rc];
+    round = dq >> 1;
+    /* Sign of z: -1 when negative, 0 otherwise. */
+    sz = -(z < 0);
+    x = (z + sz) ^ sz;              /* x = abs(z) */
+    x += round;
+    if (x >= dq)                    /* anything smaller stays at the 0 written by the memset above */
+    {
+      /* Quantize x */
+      y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+      /* Put the sign back. */
+      x = (y + sz) ^ sz;
+      /* Save the coefficient and its dequantized value. */
+      qcoeff_ptr[rc] = x;
+      dqcoeff_ptr[rc] = x * dq;
+      /* Remember the last non-zero coefficient. */
+      if (y)
+        eob = i;
+    }
+  }
+
+  d->eob = eob + 1;                 /* one past the last nonzero position, 0 if all zero */
+}
+
+void vp8_strict_quantize_b_8x8(BLOCK *b, BLOCKD *d)  // 8x8 quantizer that rounds by half the dequant step and uses no zbin table
+{
+  int i;
+  int rc;
+  int eob;
+  int x;
+  int y;
+  int z;
+  int sz;
+  short *coeff_ptr;
+  short *quant_ptr;
+  unsigned char *quant_shift_ptr;
+  short *qcoeff_ptr;
+  short *dqcoeff_ptr;
+  short *dequant_ptr;
+  //double q1st = 2;   // disabled experimental scale; only the commented-out lines below use it
+  printf("call strict quantizer\n");  /* NOTE(review): prints to stdout on every call - looks like leftover debug output; confirm it is intended */
+  coeff_ptr = b->coeff;
+  quant_ptr = b->quant;
+  quant_shift_ptr = b->quant_shift;
+  qcoeff_ptr = d->qcoeff;
+  dqcoeff_ptr = d->dqcoeff;
+  dequant_ptr = d->dequant;
+  eob = - 1;                        /* -1 means no nonzero coefficient yet */
+  vpx_memset(qcoeff_ptr, 0, 64*sizeof(short));    /* clear all 64 output coefficients */
+  vpx_memset(dqcoeff_ptr, 0, 64*sizeof(short));
+  for (i = 0; i < 64; i++)
+  {
+    int dq;
+    int round;
+
+    /*TODO: These arrays should be stored in zig-zag order.*/
+    rc = vp8_default_zig_zag1d_8x8[i];
+    z = coeff_ptr[rc];
+    //z = z * q1st;
+    //dq = dequant_ptr[rc!=0]/q1st;
+    dq = dequant_ptr[rc!=0];        /* entry 0 for position 0, entry 1 for every other position */
+    round = dq >> 1;
+    /* Sign of z: -1 when negative, 0 otherwise. */
+    sz = -(z < 0);
+    x = (z + sz) ^ sz;              /* x = abs(z) */
+    x += round;
+    if (x >= dq)                    /* anything smaller stays at the 0 written by the memset above */
+    {
+      /* Quantize x. */
+      y  = ((int)(((int)((x * quant_ptr[rc!=0])) >> 16) + x)) >> quant_shift_ptr[rc!=0];
+      /* Put the sign back. */
+      x = (y + sz) ^ sz;
+      /* Save the coefficient and its dequantized value. */
+      qcoeff_ptr[rc] = x;
+      dqcoeff_ptr[rc] = x * dq;
+      /* Remember the last non-zero coefficient. */
+      if (y)
+        eob = i;
+    }
+  }
+  d->eob = eob + 1;                 /* one past the last nonzero position, 0 if all zero */
+}
+
+#else
+
+void vp8_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d)  // regular 2x2 quantizer with zero-run zbin boost (build without EXACT_QUANT)
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+  //double q2nd = 4;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 32);    // clear 32 bytes of each output buffer
+  vpx_memset(dqcoeff_ptr, 0, 32);
+  // eob holds the zig-zag index of the last nonzero quantized value; -1 means none.
+  eob = -1;
+  for (i = 0; i < 4; i++)
+  {
+    rc   = vp8_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+    //zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value)/q2nd;
+    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);   // per-position zbin + zero-run boost + b->zbin_extra
+    zbin_boost_ptr ++;              // next boost entry; rewound below after a nonzero result
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+    // Magnitudes below the zero bin stay at the 0 written by the memset above.
+    if (x >= zbin)
+    {
+      //y  = (((x + round_ptr[rc]/q2nd) * quant_ptr[rc]*q2nd)) >> 16; // quantize (x)
+      y  = (((x + round_ptr[rc]) * quant_ptr[rc])) >> 16; // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                         // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc]/q2nd;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;                 // one past the last nonzero position, 0 if all zero
+}
+
+void vp8_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d)  // regular 8x8 quantizer with zero-run zbin boost (build without EXACT_QUANT)
+{
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+  //double q1st = 2;   // disabled experimental scale; only the commented-out lines below use it
+  vpx_memset(qcoeff_ptr, 0, 64*sizeof(short));    // clear all 64 output coefficients
+  vpx_memset(dqcoeff_ptr, 0, 64*sizeof(short));
+  // eob holds the zig-zag index of the last nonzero quantized value; -1 means none.
+  eob = -1;
+  for (i = 0; i < 64; i++)
+  {
+    // Tables are indexed with rc!=0: entry 0 for position 0, entry 1 for all others.
+    rc   = vp8_default_zig_zag1d_8x8[i];
+    z    = coeff_ptr[rc];
+    //zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value)/q1st;
+    zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
+    zbin_boost_ptr ++;  // NOTE(review): a zero run can step this up to 64 entries before a reset - confirm zrun_zbin_boost is that long
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+    // Magnitudes below the zero bin stay at the 0 written by the memset above.
+    if (x >= zbin)
+    {
+      //y  = ((x + round_ptr[rc!=0]/q1st) * quant_ptr[rc!=0] * q1st) >> 16;
+      y  = ((x + round_ptr[rc!=0]) * quant_ptr[rc!=0]) >> 16;
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                         // write to destination
+      //dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0]/q1st;        // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];        // dequantized value
+
+      if (y)
+      {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+      }
+    }
+  }
+  d->eob = eob + 1;                 // one past the last nonzero position, 0 if all zero
+}
+
+#endif  //EXACT_QUANT
+
+void vp8_quantize_mby_8x8(MACROBLOCK *x)
+{
+  // Luma only: clear the eobs, run the 8x8 quantizer on slots 0/4/8/12, then the 2x2 on slot 24.
+  BLOCKD *const dblk = x->e_mbd.block;
+  const int mode = x->e_mbd.mode_info_context->mbmi.mode;
+  int n;
+
+  // Slot 24 is cleared even when the 2x2 quantizer below is skipped.
+  dblk[24].eob = 0;
+  for (n = 15; n >= 0; n--)
+    dblk[n].eob = 0;
+
+  for (n = 0; n < 16; n += 4)
+    x->quantize_b_8x8(x->block + n, dblk + n);
+  if (mode != B_PRED && mode != SPLITMV)  // only these modes carry a second-order block
+    x->quantize_b_2x2(x->block + 24, dblk + 24);
+}
+
+void vp8_quantize_mb_8x8(MACROBLOCK *x)
+{
+  // Whole macroblock: clear all 25 eobs, 8x8-quantize slots 0,4,...,20, then the 2x2 on slot 24.
+  BLOCKD *const dblk = x->e_mbd.block;
+  const int mode = x->e_mbd.mode_info_context->mbmi.mode;
+  int n;
+
+  for (n = 0; n <= 24; n++)
+    dblk[n].eob = 0;
+
+  for (n = 0; n < 24; n += 4)
+    x->quantize_b_8x8(x->block + n, dblk + n);
+  if (mode != B_PRED && mode != SPLITMV)  // only these modes carry a second-order block
+    x->quantize_b_2x2(x->block + 24, dblk + 24);
+}
+
+void vp8_quantize_mbuv_8x8(MACROBLOCK *x)
+{
+  // Chroma only: clear eobs for slots 16..23, then 8x8-quantize slots 16 and 20.
+  BLOCKD *const dblk = x->e_mbd.block;
+  int n;
+
+  for (n = 16; n < 24; n++)
+    dblk[n].eob = 0;
+  for (n = 16; n < 24; n += 4)
+    x->quantize_b_8x8(x->block + n, dblk + n);
+}
+
+#endif //CONFIG_T8X8
+
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
  * these two C functions if corresponding optimized routine is not available.
  * NEON optimized version implements currently the fast quantization for pair
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
index f1f0156..1a2bad6 100644
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -45,6 +45,27 @@
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_c
 #endif
 extern prototype_quantize_block(vp8_quantize_fastquantb);
+#if CONFIG_T8X8  /* 8x8 and 2x2 quantizer entry points are declared only in T8X8 builds */
+#ifndef vp8_quantize_quantb_8x8  /* a definition made before this point takes precedence */
+#define vp8_quantize_quantb_8x8 vp8_regular_quantize_b_8x8
+#endif
+extern prototype_quantize_block(vp8_quantize_quantb_8x8);
+/* Fast 8x8 variant: same override-or-default pattern. */
+#ifndef vp8_quantize_fastquantb_8x8
+#define vp8_quantize_fastquantb_8x8 vp8_fast_quantize_b_8x8_c
+#endif
+extern prototype_quantize_block(vp8_quantize_fastquantb_8x8);
+/* Regular 2x2 variant. */
+#ifndef vp8_quantize_quantb_2x2
+#define vp8_quantize_quantb_2x2 vp8_regular_quantize_b_2x2
+#endif
+extern prototype_quantize_block(vp8_quantize_quantb_2x2);
+/* Fast 2x2 variant. */
+#ifndef vp8_quantize_fastquantb_2x2
+#define vp8_quantize_fastquantb_2x2 vp8_fast_quantize_b_2x2_c
+#endif
+extern prototype_quantize_block(vp8_quantize_fastquantb_2x2);
+#endif /* CONFIG_T8X8 */
 
 #ifndef vp8_quantize_fastquantb_pair
 #define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_c
@@ -56,6 +77,12 @@
     prototype_quantize_block(*quantb);
     prototype_quantize_block_pair(*quantb_pair);
     prototype_quantize_block(*fastquantb);
+#if CONFIG_T8X8
+    prototype_quantize_block(*quantb_8x8);
+    prototype_quantize_block(*fastquantb_8x8);
+    prototype_quantize_block(*quantb_2x2);
+    prototype_quantize_block(*fastquantb_2x2);
+#endif
     prototype_quantize_block_pair(*fastquantb_pair);
 } vp8_quantize_rtcd_vtable_t;
 
@@ -81,7 +108,10 @@
 #endif
 
 extern void vp8_strict_quantize_b(BLOCK *b,BLOCKD *d);
-
+#if CONFIG_T8X8
+extern void vp8_strict_quantize_b_8x8(BLOCK *b,BLOCKD *d);
+extern void vp8_strict_quantize_b_2x2(BLOCK *b,BLOCKD *d);
+#endif
 struct VP8_COMP;
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index f1a3fb3..9529770 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -197,7 +197,6 @@
     61347,64827,69312,73947,78732,83667,89787,97200,
 };
 #endif
-
 /* values are now correlated to quantizer */
 static int sad_per_bit16lut[QINDEX_RANGE] =
 {
@@ -252,8 +251,6 @@
 {
     int q;
     int i;
-    int *thresh;
-    int threshmult;
 
     vp8_clear_system_state();  //__asm emms;
 
@@ -268,7 +265,6 @@
     if (cpi->zbin_over_quant  > 0)
     {
         double oq_factor;
-        double modq;
 
         // Experimental code using the same basic equation as used for Q above
         // The units of cpi->zbin_over_quant are 1/128 of Q bin size
@@ -1055,7 +1051,6 @@
 }
 
 
-
 static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
 
 
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index 95134cb..ea04cbf 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -13,6 +13,7 @@
 #define __INC_RDOPT_H
 
 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+#define RDCOST_8x8(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )  /* same expansion as RDCOST above */
 
 extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 15e7336..e1e1248 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -22,18 +22,27 @@
 
 #ifdef ENTROPY_STATS
 _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#if CONFIG_T8X8
+_int64 context_counters_8x8[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
 #endif
 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+#if CONFIG_T8X8
+void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+#endif
 void vp8_fix_contexts(MACROBLOCKD *x);
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE*2];
 const TOKENVALUE *vp8_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE*2];
 const int *vp8_dct_value_cost_ptr;
-#if 0
-int skip_true_count = 0;
-int skip_false_count = 0;
+
+#ifdef ENC_DEBUG
+extern int mb_row_debug;
+extern int mb_col_debug;
+extern int enc_debug;
 #endif
+
 static void fill_value_tokens()
 {
 
@@ -93,6 +102,69 @@
     vp8_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
 }
 
+#if CONFIG_T8X8
+static void tokenize2nd_order_b_8x8  /* tokenizes one block of up to 4 coefficients with the 8x8 probability/count tables */
+(
+    const BLOCKD *const b,      /* block whose eob and qcoeff are read */
+    TOKENEXTRA **tp,            /* in: first free token slot; out: one past the last token written */
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype, /* not referenced in this function */
+    ENTROPY_CONTEXT *a,         /* above context entry: read for the initial pt, overwritten on exit */
+    ENTROPY_CONTEXT *l,         /* left context entry: read for the initial pt, overwritten on exit */
+    VP8_COMP *cpi               /* supplies coef_probs_8x8 and receives the coef_counts_8x8 increments */
+)
+{
+    int pt; /* near block/prev token context index */
+    int c = 0;          /* start at DC */
+    const int eob = b->eob;     /* one beyond last nonzero coeff */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    int x;
+    const short *qcoeff_ptr = b->qcoeff;
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    /* pt is now the number of non-zero entries among *a and *l (0..2). */
+    assert(eob<=4);
+
+    do
+    {
+        const int band = vp8_coef_bands[c];
+
+        if (c < eob)
+        {
+            int rc = vp8_default_zig_zag1d[c];
+            const int v = qcoeff_ptr[rc];
+
+            assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            x        = vp8_dct_value_tokens_ptr[v].Token;
+        }
+        else
+            x = DCT_EOB_TOKEN;
+
+        t->Token = x;
+        //printf("Token : %d\n", x);
+        t->context_tree = cpi->common.fc.coef_probs_8x8 [type] [band] [pt];
+
+        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+#ifdef ENC_DEBUG
+        if (t->skip_eob_node && vp8_coef_encodings[x].Len==1)
+          printf("Trouble 2 x=%d Len=%d skip=%d eob=%d c=%d band=%d type=%d: [%d %d %d]\n",
+                 x, vp8_coef_encodings[x].Len, t->skip_eob_node, eob, c, band, type,
+                 cpi->count, mb_row_debug, mb_col_debug);
+#endif
+
+        ++cpi->coef_counts_8x8       [type] [band] [pt] [x];
+    }
+    while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < 4);
+    /* The loop makes one extra pass after the last coefficient to emit DCT_EOB_TOKEN, unless c reached 4. */
+    *tp = t;
+    pt = (c != !type); /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+
+}
+#endif
+
 static void tokenize2nd_order_b
 (
     MACROBLOCKD *x,
@@ -153,6 +225,66 @@
     *a = *l = pt;
 
 }
+#if CONFIG_T8X8
+static void tokenize1st_order_b_8x8  /* tokenizes one block of up to 64 coefficients in 8x8 zig-zag order */
+(
+    const BLOCKD *const b,      /* block whose eob and qcoeff are read */
+    TOKENEXTRA **tp,            /* in: first free token slot; out: one past the last token written */
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype, /* not referenced in this function */
+    ENTROPY_CONTEXT *a,         /* first above context entry: read, then overwritten on exit */
+    ENTROPY_CONTEXT *l,         /* first left context entry: read, then overwritten on exit */
+    ENTROPY_CONTEXT *a1,        /* second above context entry: read only */
+    ENTROPY_CONTEXT *l1,        /* second left context entry: read only */
+    VP8_COMP *cpi               /* supplies coef_probs_8x8 and receives the coef_counts_8x8 increments */
+)
+{
+    int pt; /* near block/prev token context index */
+    int c = type ? 0 : 1;       /* start at DC unless type 0 */
+    const int eob = b->eob;     /* one beyond last nonzero coeff */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    int x;
+    const short *qcoeff_ptr = b->qcoeff;
+    VP8_COMBINEENTROPYCONTEXTS_8x8(pt, *a, *l, *a1, *l1);
+    /* pt = (*a or *a1 non-zero) + (*l or *l1 non-zero), given the macro's argument pairing. */
+    do
+    {
+        const int band = vp8_coef_bands_8x8[c];
+
+        x = DCT_EOB_TOKEN;  /* default; replaced below while c is still inside the coded range */
+
+        if (c < eob)
+        {
+            int rc = vp8_default_zig_zag1d_8x8[c];
+            const int v = qcoeff_ptr[rc];
+
+            assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            x        = vp8_dct_value_tokens_ptr[v].Token;
+        }
+
+        t->Token = x;
+        t->context_tree = cpi->common.fc.coef_probs_8x8 [type] [band] [pt];
+
+        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+#ifdef ENC_DEBUG
+        if (t->skip_eob_node && vp8_coef_encodings[x].Len==1)
+          printf("Trouble 1 x=%d Len=%d skip=%d eob=%d c=%d band=%d type=%d: [%d %d %d]\n", x, vp8_coef_encodings[x].Len, t->skip_eob_node, eob, c, band, type, cpi->count, mb_row_debug, mb_col_debug);
+#endif
+
+        ++cpi->coef_counts_8x8       [type] [band] [pt] [x];
+    }
+    while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < 64);
+    /* The loop makes one extra pass after the last coefficient to emit DCT_EOB_TOKEN, unless c reached 64. */
+    *tp = t;
+    pt = (c != !type); /* 0 <-> all coeff data is zero */
+    *a = *l = pt;               /* a1 and l1 are left for the caller to update */
+}
+
+#endif
+
 
 static void tokenize1st_order_b
 (
@@ -293,22 +425,59 @@
     return skip;
 }
 
+#if CONFIG_T8X8
+static int mb_is_skippable_8x8(MACROBLOCKD *x)  /* returns 1 when every block slot inspected below counts as empty, else 0 */
+{
+    int has_y2_block;
+    int skip = 1;
+    int i = 0;
+    /* has_y2_block is 1 for every mode except B_PRED and SPLITMV. */
+    has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
+                    && x->mode_info_context->mbmi.mode != SPLITMV);
+    if (has_y2_block)
+    {
+        for (i = 0; i < 16; i+=4)  /* slots 0, 4, 8, 12 */
+            skip &= (x->block[i].eob < 2);  /* eob of 0 or 1 still counts as empty here; presumably position 0 is carried by block 24 -- confirm */
+    }
+    /* Continues from i == 16 when the loop above ran, from 0 otherwise; the bound reaches slot 24 only when has_y2_block is 1. */
+    for (; i < 24 + has_y2_block; i+=4)
+        skip &= (!x->block[i].eob);
+
+    return skip;
+}
+#endif
 
 void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
 {
     int plane_type;
     int has_y2_block;
+    int b;
 
     has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
                     && x->mode_info_context->mbmi.mode != SPLITMV);
 
-    x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);
+    x->mode_info_context->mbmi.mb_skip_coeff =
+#if CONFIG_T8X8
+        (x->mode_info_context->mbmi.segment_id >= 2 ?
+         mb_is_skippable_8x8(x) :
+         mb_is_skippable(x, has_y2_block));
+#else
+         mb_is_skippable(x, has_y2_block);
+#endif
+
     if (x->mode_info_context->mbmi.mb_skip_coeff)
     {
         cpi->skip_true_count++;
 
         if (!cpi->common.mb_no_coeff_skip)
-            vp8_stuff_mb(cpi, x, t) ;
+        {
+#if CONFIG_T8X8
+            if (x->mode_info_context->mbmi.segment_id >= 2)
+                vp8_stuff_mb_8x8(cpi, x, t) ;
+            else
+#endif
+                vp8_stuff_mb(cpi, x, t) ;
+        }
         else
         {
             vp8_fix_contexts(x);
@@ -322,13 +491,82 @@
     plane_type = 3;
     if(has_y2_block)
     {
-        tokenize2nd_order_b(x, t, cpi);
-        plane_type = 0;
+#if CONFIG_T8X8
+        if (x->mode_info_context->mbmi.segment_id >= 2)
+        {
+            ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
+            ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
+            tokenize2nd_order_b_8x8(x->block + 24, t, 1, x->frame_type,
+                       A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+        }
+        else
+#endif
+            tokenize2nd_order_b(x, t, cpi);
+
+            plane_type = 0;
 
     }
+#if CONFIG_T8X8
+    if (x->mode_info_context->mbmi.segment_id >= 2)
+    {
+        ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
+        ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
+        for (b = 0; b < 16; b+=4)
+        {
+            tokenize1st_order_b_8x8(x->block + b, t, plane_type, x->frame_type,
+                                A + vp8_block2above[b],
+                                L + vp8_block2left[b],
+                                A + vp8_block2above[b+1],
+                                L + vp8_block2left[b+4],
+                                cpi);
+          /* *(A + vp8_block2above[b+1]) = *(A + vp8_block2above[b+2]) = *(A + vp8_block2above[b+3]) =
+                *(A + vp8_block2above[b]);
+            *(L + vp8_block2left[b+1]) = *(L + vp8_block2left[b+2]) = *(L + vp8_block2left[b+3]) =
+                *(L + vp8_block2left[b]);*/
+            // build coeff context for 8x8 transform
+              if(b==0)
+              {
+                *(A + vp8_block2above[1]) = *(A + vp8_block2above[4]) = *(A + vp8_block2above[5]) = *(A + vp8_block2above[b]);
+                *(L + vp8_block2left[1]) = *(L + vp8_block2left[4]) = *(L + vp8_block2left[5]) = *(L + vp8_block2left[b]);
+              }
+              else if(b==4)
+              {
+                *(A + vp8_block2above[2]) = *(A + vp8_block2above[3]) = *(A + vp8_block2above[6]) = *(A + vp8_block2above[7]) = *(A + vp8_block2above[b]);
+                *(L + vp8_block2left[2]) = *(L + vp8_block2left[3]) = *(L + vp8_block2left[6]) = *(L + vp8_block2left[7]) = *(L + vp8_block2left[b]);
+                *(A + vp8_block2above[4]) = *(A + vp8_block2above[1]);
+                *(L + vp8_block2left[4]) = *(L + vp8_block2left[1]);
+              }
+              else if(b==8)
+              {
+                *(A + vp8_block2above[9]) = *(A + vp8_block2above[12]) = *(A + vp8_block2above[13]) = *(A + vp8_block2above[b]);
+                *(L + vp8_block2left[9]) = *(L + vp8_block2left[12]) = *(L + vp8_block2left[13]) = *(L + vp8_block2left[b]);
+              }
+              else if(b==12)
+              {
+                *(A + vp8_block2above[10]) = *(A + vp8_block2above[11]) = *(A + vp8_block2above[14]) = *(A + vp8_block2above[15]) = *(A + vp8_block2above[b]);
+                *(L + vp8_block2left[10]) = *(L + vp8_block2left[11]) = *(L + vp8_block2left[14]) = *(L + vp8_block2left[15]) = *(L + vp8_block2left[b]);
+                *(A + vp8_block2above[12]) = *(A + vp8_block2above[8]);
+                *(L + vp8_block2left[12]) = *(L + vp8_block2left[8]);
+             }
 
-    tokenize1st_order_b(x, t, plane_type, cpi);
+        }
 
+        for (b = 16; b < 24; b+=4) {
+            tokenize1st_order_b_8x8(x->block + b, t, 2, x->frame_type,
+                                    A + vp8_block2above[b],
+                                    L + vp8_block2left[b],
+                                    A + vp8_block2above[b+1],
+                                    L + vp8_block2left[b+2],
+                                    cpi);
+            *(A + vp8_block2above[b+1]) = *(A + vp8_block2above[b+2]) = *(A + vp8_block2above[b+3]) =
+                *(A + vp8_block2above[b]);
+            *(L + vp8_block2left[b+1]) = *(L + vp8_block2left[b+2]) = *(L + vp8_block2left[b+3]) =
+                *(L + vp8_block2left[b]);
+        }
+    }
+    else
+#endif
+        tokenize1st_order_b(x, t, plane_type, cpi);
 }
 
 
@@ -337,6 +575,9 @@
 void init_context_counters(void)
 {
     vpx_memset(context_counters, 0, sizeof(context_counters));
+#if CONFIG_T8X8
+    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
+#endif
 }
 
 void print_context_counters()
@@ -381,7 +622,7 @@
                     const _int64 x = context_counters [type] [band] [pt] [t];
                     const int y = (int) x;
 
-                    assert(x == (_int64) y);  /* no overflow handling yet */
+                    assert(x == (INT64) y);  /* no overflow handling yet */
                     fprintf(f, "%s %d", Comma(t), y);
 
                 }
@@ -400,6 +641,56 @@
     }
     while (++type < BLOCK_TYPES);
 
+#if CONFIG_T8X8
+    fprintf(f, "\n};\n\n");  /* terminate the initializer written by the loop above before a new declaration starts */
+    fprintf(f, "int Contexts_8x8[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];\n\n");
+    fprintf(f, "const int default_contexts_8x8[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+    /* Dump context_counters_8x8 as a nested initializer; the fprintf after the #endif closes it. */
+    type = 0;
+
+    do
+    {
+        fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+
+        band = 0;
+
+        do
+        {
+            fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+
+            pt = 0;
+
+            do
+            {
+                fprintf(f, "%s\n      {", Comma(pt));
+
+                t = 0;
+
+                do
+                {
+                    const _int64 x = context_counters_8x8 [type] [band] [pt] [t];  /* the 8x8 counters, not the table already dumped above */
+                    const int y = (int) x;
+
+                    assert(x == (_int64) y);  /* no overflow handling yet */
+                    fprintf(f, "%s %d", Comma(t), y);
+
+                }
+                while (++t < MAX_ENTROPY_TOKENS);
+
+                fprintf(f, "}");
+            }
+            while (++pt < PREV_COEF_CONTEXTS);
+
+            fprintf(f, "\n    }");
+
+        }
+        while (++band < COEF_BANDS);
+
+        fprintf(f, "\n  }");
+    }
+    while (++type < BLOCK_TYPES);
+#endif
+
     fprintf(f, "\n};\n");
     fclose(f);
 }
@@ -411,6 +702,188 @@
     fill_value_tokens();
 }
 
+#if CONFIG_T8X8
+static __inline void stuff2nd_order_b_8x8  /* writes a single DCT_EOB_TOKEN using the [1][0] entries of the 8x8 tables */
+(
+    const BLOCKD *const b,      /* unused (cast to void below) */
+    TOKENEXTRA **tp,            /* in: first free token slot; out: advanced by one */
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype, /* unused (cast to void below) */
+    ENTROPY_CONTEXT *a,         /* above context entry: read for pt, then set to 0 */
+    ENTROPY_CONTEXT *l,         /* left context entry: read for pt, then set to 0 */
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    (void) frametype;
+    (void) type;                /* the table index is hard-coded to 1 below instead */
+    (void) b;
+
+    t->Token = DCT_EOB_TOKEN;
+    t->context_tree = cpi->common.fc.coef_probs_8x8 [1] [0] [pt];
+    //t->section = 11;
+    t->skip_eob_node = 0;
+    ++cpi->coef_counts_8x8       [1] [0] [pt] [DCT_EOB_TOKEN];
+    ++t;
+
+    *tp = t;
+    pt = 0;                     /* both context entries are cleared on exit */
+    *a = *l = pt;
+
+}
+
+static __inline void stuff1st_order_b_8x8  /* writes a single DCT_EOB_TOKEN using the [0][1] entries of the 8x8 tables */
+(
+    const BLOCKD *const b,      /* unused (cast to void below) */
+    TOKENEXTRA **tp,            /* in: first free token slot; out: advanced by one */
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype, /* unused (cast to void below) */
+    ENTROPY_CONTEXT *a,         /* first above context entry: read for pt, then set to 0 */
+    ENTROPY_CONTEXT *l,         /* first left context entry: read for pt, then set to 0 */
+    ENTROPY_CONTEXT *a1,        /* second above context entry: read only */
+    ENTROPY_CONTEXT *l1,        /* second left context entry: read only */
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    VP8_COMBINEENTROPYCONTEXTS_8x8(pt, *a, *l, *a1, *l1);
+    (void) frametype;
+    (void) type;                /* the table indices are hard-coded to [0][1] below instead */
+    (void) b;
+
+    t->Token = DCT_EOB_TOKEN;
+    t->context_tree = cpi->common.fc.coef_probs_8x8 [0] [1] [pt];
+    //t->section = 8;
+    t->skip_eob_node = 0;
+    ++cpi->coef_counts_8x8       [0] [1] [pt] [DCT_EOB_TOKEN];
+    ++t;
+    *tp = t;
+    pt = 0; /* 0 <-> all coeff data is zero */
+    *a = *l = pt;               /* a1 and l1 are not written here */
+
+
+}
+
+static __inline  /* writes a single DCT_EOB_TOKEN using the [2][0] entries of the 8x8 tables */
+void stuff1st_order_buv_8x8
+(
+    const BLOCKD *const b,      /* unused (cast to void below) */
+    TOKENEXTRA **tp,            /* in: first free token slot; out: advanced by one */
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype, /* unused (cast to void below) */
+    ENTROPY_CONTEXT *a,         /* first above context entry: read for pt, then set to 0 */
+    ENTROPY_CONTEXT *l,         /* first left context entry: read for pt, then set to 0 */
+    ENTROPY_CONTEXT *a1,        /* second above context entry: read only */
+    ENTROPY_CONTEXT *l1,        /* second left context entry: read only */
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    VP8_COMBINEENTROPYCONTEXTS_8x8(pt, *a, *l, *a1, *l1);
+    (void) frametype;
+    (void) type;                /* the table indices are hard-coded to [2][0] below instead */
+    (void) b;
+
+    t->Token = DCT_EOB_TOKEN;
+    t->context_tree = cpi->common.fc.coef_probs_8x8 [2] [0] [pt];
+    //t->section = 13;
+    t->skip_eob_node = 0;
+    ++cpi->coef_counts_8x8[2] [0] [pt] [DCT_EOB_TOKEN];
+    ++t;
+    *tp = t;
+    pt = 0; /* 0 <-> all coeff data is zero */
+    *a = *l = pt;               /* a1 and l1 are not written here */
+
+}
+
+void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)  /* emits one EOB token each for block 24, slots 0/4/8/12 and slots 16/20 */
+{
+    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
+    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
+    int plane_type;
+    int b;
+    /* Block 24 is stuffed unconditionally; this function makes no B_PRED/SPLITMV check. */
+    stuff2nd_order_b_8x8(x->block + 24, t, 1, x->frame_type,
+                         A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+    plane_type = 0;  /* forwarded below, but stuff1st_order_b_8x8 casts its type argument to void */
+
+    for (b = 0; b < 16; b+=4) {
+        stuff1st_order_b_8x8(x->block + b, t, plane_type, x->frame_type,
+                             A + vp8_block2above[b],
+                             L + vp8_block2left[b],
+                             A + vp8_block2above[b+1],
+                             L + vp8_block2left[b+4],  /* NOTE(review): the index is 16 when b == 12 -- confirm that entry is intended */
+                             cpi);
+        // build coeff context for 8x8 transform: the helper just stored 0 through its a and l arguments, and the copies below spread that value
+        if(b==0)
+        {
+          *(A + vp8_block2above[1]) = *(A + vp8_block2above[4]) = *(A + vp8_block2above[5]) = *(A + vp8_block2above[b]);
+          *(L + vp8_block2left[1]) = *(L + vp8_block2left[4]) = *(L + vp8_block2left[5]) = *(L + vp8_block2left[b]);
+        }
+        else if(b==4)
+        {
+          *(A + vp8_block2above[2]) = *(A + vp8_block2above[3]) = *(A + vp8_block2above[6]) = *(A + vp8_block2above[7]) = *(A + vp8_block2above[b]);
+          *(L + vp8_block2left[2]) = *(L + vp8_block2left[3]) = *(L + vp8_block2left[6]) = *(L + vp8_block2left[7]) = *(L + vp8_block2left[b]);
+          *(A + vp8_block2above[4]) = *(A + vp8_block2above[1]);
+          *(L + vp8_block2left[4]) = *(L + vp8_block2left[1]);
+        }
+        else if(b==8)
+        {
+          *(A + vp8_block2above[9]) = *(A + vp8_block2above[12]) = *(A + vp8_block2above[13]) = *(A + vp8_block2above[b]);
+          *(L + vp8_block2left[9]) = *(L + vp8_block2left[12]) = *(L + vp8_block2left[13]) = *(L + vp8_block2left[b]);
+
+        }
+        else if(b==12)
+        {
+          *(A + vp8_block2above[10]) = *(A + vp8_block2above[11]) = *(A + vp8_block2above[14]) = *(A + vp8_block2above[15]) = *(A + vp8_block2above[b]);
+          *(L + vp8_block2left[10]) = *(L + vp8_block2left[11]) = *(L + vp8_block2left[14]) = *(L + vp8_block2left[15]) = *(L + vp8_block2left[b]);
+          *(A + vp8_block2above[12]) = *(A + vp8_block2above[8]);
+          *(L + vp8_block2left[12]) = *(L + vp8_block2left[8]);
+
+        }
+
+    }
+    /* Disabled alternative; this call form lacks the a1/l1 arguments that stuff1st_order_b_8x8 takes.
+    for (b = 0; b < 16; b+=4) {
+        stuff1st_order_b_8x8(x->block + b, t, plane_type, x->frame_type,
+                             A + vp8_block2above[b],
+                             L + vp8_block2left[b], cpi);
+        *(A + vp8_block2above[b+1]) = *(A + vp8_block2above[b+2]) = *(A + vp8_block2above[b+3]) =
+            *(A + vp8_block2above[b]);
+        *(L + vp8_block2left[b+1]) = *(L + vp8_block2left[b+2]) = *(L + vp8_block2left[b+3]) =
+            *(L + vp8_block2left[b]);
+    }
+    */
+
+    for (b = 16; b < 24; b+=4) {  /* slots 16 and 20, always stuffed with the [2][0] table entries */
+      stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
+                             A + vp8_block2above[b],
+                             L + vp8_block2left[b],
+                             A + vp8_block2above[b+1],
+                             L + vp8_block2left[b+2],
+                             cpi);
+      *(A + vp8_block2above[b+1]) = *(A + vp8_block2above[b+2]) = *(A + vp8_block2above[b+3]) =
+          *(A + vp8_block2above[b]);  /* copy the entry the helper just cleared to the other three slots of the group */
+      *(L + vp8_block2left[b+1]) = *(L + vp8_block2left[b+2]) = *(L + vp8_block2left[b+3]) =
+          *(L + vp8_block2left[b]);
+    }
+    /* Disabled alternative; this call form lacks the a1/l1 arguments that stuff1st_order_buv_8x8 takes.
+    for (b = 16; b < 24; b+=4) {
+        stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
+                               A + vp8_block2above[b],
+                               L + vp8_block2left[b], cpi);
+        *(A + vp8_block2above[b+1]) = *(A + vp8_block2above[b+2]) = *(A + vp8_block2above[b+3]) =
+            *(A + vp8_block2above[b]);
+        *(L + vp8_block2left[b+1]) = *(L + vp8_block2left[b+2]) = *(L + vp8_block2left[b+3]) =
+            *(L + vp8_block2left[b]);
+    }
+    */
+}
+#endif
 
 static __inline void stuff2nd_order_b
 (
diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h
index 04a8879..cd122f1 100644
--- a/vp8/encoder/tokenize.h
+++ b/vp8/encoder/tokenize.h
@@ -38,8 +38,10 @@
 void print_context_counters();
 
 extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#if CONFIG_T8X8
+extern _int64 context_counters_8x8[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
 #endif
-
+#endif
 extern const int *vp8_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 053ecae..1a883b8 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -102,6 +102,11 @@
 endif
 
 # common (c)
+ifeq ($(CONFIG_CSM),yes)
+VP8_COMMON_SRCS-yes += common/maskingmv.c
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
+endif
+
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h
diff --git a/vpxenc.c b/vpxenc.c
index ec807d7..aa37990 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -32,6 +32,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 #endif
+#include "vpx_config.h"
 #include "vpx_version.h"
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
@@ -76,6 +77,9 @@
     unsigned int             fourcc;
 } codecs[] =
 {
+#if CONFIG_EXPERIMENTAL && CONFIG_VP8_ENCODER
+    {"vp8x",  &vpx_codec_vp8x_cx_algo, 0x78385056},
+#endif
 #if CONFIG_VP8_ENCODER
     {"vp8",  &vpx_codec_vp8_cx_algo, 0x30385056},
 #endif
@@ -1692,7 +1696,11 @@
     /* Handle codec specific options */
 #if CONFIG_VP8_ENCODER
 
-    if (codec->iface == &vpx_codec_vp8_cx_algo)
+    if (codec->iface == &vpx_codec_vp8_cx_algo
+#if CONFIG_EXPERIMENTAL
+        || codec->iface == &vpx_codec_vp8x_cx_algo
+#endif
+        )
     {
         ctrl_args = vp8_args;
         ctrl_args_map = vp8_arg_ctrl_map;