Merge "Removing is_intra_mode() function."
diff --git a/build/make/Makefile b/build/make/Makefile
index 7a25239..030c1b5 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -114,6 +114,10 @@
 $(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
 $(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
 $(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
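
These pattern rules hand -mavx / -mavx2 only to objects built from *_avx.c /
*_avx2.c sources, so the rest of the tree keeps the baseline ISA and AVX2
instructions cannot leak into code that runs unconditionally. A minimal sketch
of the convention, using a hypothetical vp9_foo_avx2.c that is not part of
this change:

    #include <immintrin.h>
    #include <stdint.h>

    /* Sums 32 bytes with a single 256-bit load. This file is compiled with
     * -mavx2 because of its name; generic code must reach the function
     * through a runtime capability check. */
    int vp9_foo_sum32_avx2(const uint8_t *p) {
      const __m256i v = _mm256_loadu_si256((const __m256i *)p);
      const __m256i sad = _mm256_sad_epu8(v, _mm256_setzero_si256());
      /* Fold the four 64-bit partial sums produced by the SAD. */
      __m128i s = _mm_add_epi64(_mm256_castsi256_si128(sad),
                                _mm256_extracti128_si256(sad, 1));
      s = _mm_add_epi64(s, _mm_srli_si128(s, 8));
      return _mm_cvtsi128_si32(s);
    }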
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c6c8660..83f480a 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1108,6 +1108,18 @@
             soft_enable sse4_1
         fi
 
+        if enabled gcc && ! disabled avx && ! check_cflags -mavx; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx "
+        else
+            soft_enable avx
+        fi
+
+        if enabled gcc && ! disabled avx2 && ! check_cflags -mavx2; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx2 "
+        else
+            soft_enable avx2
+        fi
+
         case "${AS}" in
             auto|"")
                 which nasm >/dev/null 2>&1 && AS=nasm
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index c531e95..2967b5a 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -327,11 +327,11 @@
 require c
 case $arch in
   x86)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
     x86
     ;;
   x86_64)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
     REQUIRES=${REQUIRES:-mmx sse sse2}
     require $(filter $REQUIRES)
     x86
diff --git a/configure b/configure
index 297cec4..621161c 100755
--- a/configure
+++ b/configure
@@ -234,6 +234,8 @@
     sse3
     ssse3
     sse4_1
+    avx
+    avx2
 
     altivec
 "
@@ -422,7 +424,7 @@
     fi
 
     # The write_common_config (config.mk) logic is deferred until after the
-    # recursive calls to configure complete, becuase we want our universal
+    # recursive calls to configure complete, because we want our universal
     # targets to be executed last.
     write_common_config_targets
     enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
@@ -608,7 +610,12 @@
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
         case ${CC} in
-          *clang*) ;;
+          *clang*)
+              # libvpx and/or clang have issues with aliasing:
+              # https://code.google.com/p/webm/issues/detail?id=603
+              # work around them until they are fixed
+              check_add_cflags -fno-strict-aliasing
+          ;;
           *) check_add_cflags -Wunused-but-set-variable ;;
         esac
         enabled extra_warnings || check_add_cflags -Wno-unused-function
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 6d50644..85f4bb6 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -176,14 +176,32 @@
   }
 }
 
-class DatarateTestVP9 : public DatarateTest {
+class DatarateTestVP9 : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestVP9() : EncoderTest(GET_PARAM(0)) {}
+
  protected:
   virtual ~DatarateTestVP9() {}
 
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    frame_number_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+  }
+
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                     ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, 2);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
     }
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -205,6 +223,14 @@
       effective_datarate_ = ((bits_total_) / 1000.0) / duration_;
     }
   }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;
+  int64_t bits_total_;
+  double duration_;
+  double effective_datarate_;
+  int set_cpu_used_;
 };
 
 // There is no buffer model/frame dropper in VP9 currently, so for now we
@@ -218,7 +244,7 @@
 
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
-  for (int i = 200; i < 800; i += 200) {
+  for (int i = 150; i < 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
@@ -231,5 +257,6 @@
 
 VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9,
-                          ::testing::Values(::libvpx_test::kOnePassGood));
+                          ::testing::Values(::libvpx_test::kOnePassGood),
+                          ::testing::Range(1, 5));
 }  // namespace
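
The VP9 datarate test is now parameterized over both the test mode and
cpu_used: ::testing::Range(1, 5) is half-open, so it exercises speeds 1
through 4. For reference, a sketch of the arithmetic the test's model keeps
(names loosely follow the members added above):

    #include <stdint.h>

    /* timebase is tb.num / tb.den seconds per pts tick, as computed in
     * PreEncodeFrameHook; pts_ticks is the accumulated pts delta. */
    double effective_datarate_kbps(int64_t bits_total, int64_t pts_ticks,
                                   double timebase) {
      const double duration = pts_ticks * timebase;
      return (bits_total / 1000.0) / duration;
    }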
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index a4dbca4..80aca98 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -45,6 +45,10 @@
     append_gtest_filter(":-SSSE3/*");
   if (!(simd_caps & HAS_SSE4_1))
     append_gtest_filter(":-SSE4_1/*");
+  if (!(simd_caps & HAS_AVX))
+    append_gtest_filter(":-AVX/*");
+  if (!(simd_caps & HAS_AVX2))
+    append_gtest_filter(":-AVX2/*");
 #endif
 
 #if !CONFIG_SHARED
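
These filters skip AVX/AVX2-specialized test cases on machines whose CPU or
OS cannot execute them. A rough sketch of how such capability bits can be
derived, assuming GCC/Clang's <cpuid.h> (this is not libvpx's actual
x86_simd_caps(), and the bit values are illustrative):

    #include <cpuid.h>

    #define SKETCH_HAS_AVX  (1 << 0)
    #define SKETCH_HAS_AVX2 (1 << 1)

    static unsigned int simd_caps_sketch(void) {
      unsigned int eax, ebx, ecx, edx, caps = 0;
      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
      /* AVX needs the CPUID bit plus OS-managed YMM state (OSXSAVE). */
      if ((ecx & bit_AVX) && (ecx & bit_OSXSAVE)) {
        unsigned int xlo, xhi;
        __asm__("xgetbv" : "=a"(xlo), "=d"(xhi) : "c"(0));
        if ((xlo & 0x6) == 0x6) {  /* XMM and YMM state enabled */
          caps |= SKETCH_HAS_AVX;
          if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
              (ebx & bit_AVX2))
            caps |= SKETCH_HAS_AVX2;
        }
      }
      return caps;
    }

The OSXSAVE/xgetbv step matters because the CPUID AVX bit alone does not
guarantee the OS saves YMM registers across context switches; AVX2 is then a
separate CPUID leaf-7 feature bit.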
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 76fc9bb..a8ce6e4 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -8,16 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/decoder/vp9_thread.h"
+#include <string>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/md5_helper.h"
 #include "test/webm_video_source.h"
+#include "vp9/decoder/vp9_thread.h"
 
 namespace {
 
+using std::string;
+
 class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
  protected:
   virtual ~VP9WorkerThreadTest() {}
@@ -91,19 +94,26 @@
   EXPECT_FALSE(worker_.had_error);
 }
 
-TEST(VP9DecodeMTTest, MTDecode) {
-  libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm");
+// -----------------------------------------------------------------------------
+// Multi-threaded decode tests
+
+// Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
+string DecodeFile(const string& filename, int num_threads) {
+  libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
   vpx_codec_dec_cfg_t cfg = {0};
-  cfg.threads = 2;
+  cfg.threads = num_threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
   libvpx_test::MD5 md5;
   for (video.Begin(); video.cxdata(); video.Next()) {
     const vpx_codec_err_t res =
         decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
 
     libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
     const vpx_image_t *img = NULL;
@@ -113,7 +123,32 @@
       md5.Add(img);
     }
   }
-  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get());
+  return string(md5.Get());
+}
+
+TEST(VP9DecodeMTTest, MTDecode) {
+  // no tiles or frame parallel; this exercises loop filter threading.
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+               DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str());
+}
+
+TEST(VP9DecodeMTTest, MTDecode2) {
+  static const struct {
+    const char *name;
+    const char *expected_md5;
+  } files[] = {
+    { "vp90-2-08-tile_1x2_frame_parallel.webm",
+      "68ede6abd66bae0a2edf2eb9232241b6" },
+    { "vp90-2-08-tile_1x4_frame_parallel.webm",
+      "368ebc6ebf3a5e478d85b2c3149b2848" },
+  };
+
+  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
+    for (int t = 2; t <= 4; ++t) {
+      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
+          << "threads = " << t;
+    }
+  }
 }
 
 INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool());
diff --git a/vp8/common/idct_blk.c b/vp8/common/idct_blk.c
index 8edfffb..65d5002 100644
--- a/vp8/common/idct_blk.c
+++ b/vp8/common/idct_blk.c
@@ -10,6 +10,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
 
 void vp8_dequant_idct_add_c(short *input, short *dq,
                             unsigned char *dest, int stride);
@@ -32,7 +33,7 @@
             else
             {
                 vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
-                ((int *)q)[0] = 0;
+                vpx_memset(q, 0, 2 * sizeof(q[0]));
             }
 
             q   += 16;
@@ -58,7 +59,7 @@
             else
             {
                 vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
-                ((int *)q)[0] = 0;
+                vpx_memset(q, 0, 2 * sizeof(q[0]));
             }
 
             q    += 16;
@@ -77,7 +78,7 @@
             else
             {
                 vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
-                ((int *)q)[0] = 0;
+                vpx_memset(q, 0, 2 * sizeof(q[0]));
             }
 
             q    += 16;
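
Each hunk above replaces a store that type-punned the 16-bit coefficient
array through int *, which is undefined behavior under C's strict-aliasing
rules and the same class of problem the configure change works around for
clang (webm issue 603). A standalone sketch of the before/after, using plain
memset in place of libvpx's vpx_memset wrapper:

    #include <string.h>

    void zero_dc_pair_old(short *q) {
      ((int *)q)[0] = 0;               /* undefined: int store aliases shorts */
    }

    void zero_dc_pair_new(short *q) {
      memset(q, 0, 2 * sizeof(q[0]));  /* well-defined; same two shorts */
    }

For a constant 4-byte size, compilers lower the memset to a single 32-bit
store, so the fix costs nothing at runtime.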
diff --git a/vp8/common/x86/idct_blk_mmx.c b/vp8/common/x86/idct_blk_mmx.c
index 49b2013..a1e4ce6 100644
--- a/vp8/common/x86/idct_blk_mmx.c
+++ b/vp8/common/x86/idct_blk_mmx.c
@@ -11,6 +11,7 @@
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
 
 extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
 
@@ -35,7 +36,7 @@
         else if (eobs[0] == 1)
         {
             vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
-            ((int *)q)[0] = 0;
+            vpx_memset(q, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[1] > 1)
@@ -44,7 +45,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
                                       dst+4, stride);
-            ((int *)(q+16))[0] = 0;
+            vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[2] > 1)
@@ -53,7 +54,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
                                       dst+8, stride);
-            ((int *)(q+32))[0] = 0;
+            vpx_memset(q + 32, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[3] > 1)
@@ -62,7 +63,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
                                       dst+12, stride);
-            ((int *)(q+48))[0] = 0;
+            vpx_memset(q + 48, 0, 2 * sizeof(q[0]));
         }
 
         q    += 64;
@@ -84,7 +85,7 @@
         else if (eobs[0] == 1)
         {
             vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
-            ((int *)q)[0] = 0;
+            vpx_memset(q, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[1] > 1)
@@ -93,7 +94,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
                                       dstu+4, stride);
-            ((int *)(q+16))[0] = 0;
+            vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
         }
 
         q    += 32;
@@ -108,7 +109,7 @@
         else if (eobs[0] == 1)
         {
             vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
-            ((int *)q)[0] = 0;
+            vpx_memset(q, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[1] > 1)
@@ -117,7 +118,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
                                       dstv+4, stride);
-            ((int *)(q+16))[0] = 0;
+            vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
         }
 
         q    += 32;
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 759d842..35a22c7 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -512,15 +512,15 @@
                 else
                 {
                     mbmi->mode =  NEARMV;
-                    vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
                     mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+                    vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
                 }
             }
             else
             {
                 mbmi->mode =  NEARESTMV;
-                vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
                 mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+                vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
             }
         }
         else
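
The reordering matters because vp8_clamp_mv2 clamps in place: clamping
near_mvs[CNT_NEAR] or near_mvs[CNT_NEAREST] before the copy silently mutated
the shared candidate array, while clamping mbmi->mv afterwards touches only
this block's final motion vector. A simplified sketch of the pattern, with
illustrative types and bounds rather than VP8's actual ones:

    typedef struct { short row, col; } MV_SKETCH;

    static short clamp_short(short v, short lo, short hi) {
      return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Copy the prediction candidate first, then clamp only the copy. */
    static void assign_then_clamp(MV_SKETCH *dst, const MV_SKETCH *candidate,
                                  const MV_SKETCH *lo, const MV_SKETCH *hi) {
      *dst = *candidate;
      dst->row = clamp_short(dst->row, lo->row, hi->row);
      dst->col = clamp_short(dst->col, lo->col, hi->col);
    }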
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 50ee9c5..16da78a 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -211,7 +211,7 @@
                             (b->qcoeff[0] * DQC[0],
                                 dst, dst_stride,
                                 dst, dst_stride);
-                        ((int *)b->qcoeff)[0] = 0;
+                        vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                     }
                 }
             }
@@ -248,21 +248,14 @@
 
                     vp8_short_inv_walsh4x4(&b->dqcoeff[0],
                         xd->qcoeff);
-                    ((int *)b->qcoeff)[0] = 0;
-                    ((int *)b->qcoeff)[1] = 0;
-                    ((int *)b->qcoeff)[2] = 0;
-                    ((int *)b->qcoeff)[3] = 0;
-                    ((int *)b->qcoeff)[4] = 0;
-                    ((int *)b->qcoeff)[5] = 0;
-                    ((int *)b->qcoeff)[6] = 0;
-                    ((int *)b->qcoeff)[7] = 0;
+                    vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
                 }
                 else
                 {
                     b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                     vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
                         xd->qcoeff);
-                    ((int *)b->qcoeff)[0] = 0;
+                    vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                 }
 
                 /* override the dc dequant constant in order to preserve the
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 7303189..fe290cf 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -227,7 +227,7 @@
                     {
                         vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
                                              dst, dst_stride, dst, dst_stride);
-                        ((int *)b->qcoeff)[0] = 0;
+                        vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                     }
                 }
             }
@@ -264,21 +264,14 @@
 
                     vp8_short_inv_walsh4x4(&b->dqcoeff[0],
                         xd->qcoeff);
-                    ((int *)b->qcoeff)[0] = 0;
-                    ((int *)b->qcoeff)[1] = 0;
-                    ((int *)b->qcoeff)[2] = 0;
-                    ((int *)b->qcoeff)[3] = 0;
-                    ((int *)b->qcoeff)[4] = 0;
-                    ((int *)b->qcoeff)[5] = 0;
-                    ((int *)b->qcoeff)[6] = 0;
-                    ((int *)b->qcoeff)[7] = 0;
+                    vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
                 }
                 else
                 {
                     b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                     vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
                         xd->qcoeff);
-                    ((int *)b->qcoeff)[0] = 0;
+                    vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                 }
 
                 /* override the dc dequant constant in order to preserve the
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3347b35..21c91d6 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -309,8 +309,8 @@
   192, 128, 64
 };
 
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
-                                                  [SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
   { 36, 255, },
   { 34, 3, },
@@ -416,7 +416,7 @@
                       fc->partition_prob[INTER_FRAME][i], 0);
 
   if (cm->mcomp_filter_type == SWITCHABLE) {
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
                         counts->switchable_interp[i],
                         pre_fc->switchable_interp_prob[i],
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ab37b75..ea96555 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -16,6 +16,7 @@
 
 #define TX_SIZE_CONTEXTS 2
 #define SWITCHABLE_FILTERS 3   // number of switchable filters
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
 
 // #define MODE_STATS
 
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 85ac6d2..218e12e 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -16,12 +16,6 @@
 
 #include "vp9/common/vp9_seg_common.h"
 
-struct loop_filter_info {
-  const uint8_t *mblim;
-  const uint8_t *lim;
-  const uint8_t *hev_thr;
-};
-
 // This structure holds bit masks for all 8x8 blocks in a 64x64 region.
 // Each 1 bit represents a position in which we want to apply the loop filter.
 // Left_ entries refer to whether we apply a filter on the border to the
@@ -259,8 +253,8 @@
     if (block_inside_limit < 1)
       block_inside_limit = 1;
 
-    vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
-    vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
+    vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+    vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
                SIMD_WIDTH);
   }
 }
@@ -268,7 +262,7 @@
 void vp9_loop_filter_init(VP9_COMMON *cm) {
   loop_filter_info_n *lfi = &cm->lf_info;
   struct loopfilter *lf = &cm->lf;
-  int i;
+  int lvl;
 
   // init limits for given sharpness
   update_sharpness(lfi, lf->sharpness_level);
@@ -278,8 +272,8 @@
   lf_init_lut(lfi);
 
   // init hev threshold const vectors
-  for (i = 0; i < 4; i++)
-    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+    vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
 }
 
 void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -330,16 +324,14 @@
 
 static int build_lfi(const loop_filter_info_n *lfi_n,
                      const MB_MODE_INFO *mbmi,
-                     struct loop_filter_info *lfi) {
+                     const loop_filter_thresh **lfi) {
   const int seg = mbmi->segment_id;
   const int ref = mbmi->ref_frame[0];
   const int mode = lfi_n->mode_lf_lut[mbmi->mode];
   const int filter_level = lfi_n->lvl[seg][ref][mode];
 
   if (filter_level > 0) {
-    lfi->mblim = lfi_n->mblim[filter_level];
-    lfi->lim = lfi_n->lim[filter_level];
-    lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
+    *lfi = &lfi_n->lfthr[filter_level];
     return 1;
   } else {
     return 0;
@@ -351,11 +343,13 @@
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                    const struct loop_filter_info *lfi) {
+                                    const loop_filter_thresh **p_lfi) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = *p_lfi;
+
     if (mask & 1) {
       if (mask_16x16 & 1) {
         vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -379,7 +373,7 @@
       vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, 1);
     s += 8;
-    lfi++;
+    p_lfi++;
     mask_16x16 >>= 1;
     mask_8x8 >>= 1;
     mask_4x4 >>= 1;
@@ -393,12 +387,14 @@
                                      unsigned int mask_4x4,
                                      unsigned int mask_4x4_int,
                                      int only_4x4_1,
-                                     const struct loop_filter_info *lfi) {
+                                     const loop_filter_thresh **p_lfi) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
+    const loop_filter_thresh *lfi = *p_lfi;
+
     count = 1;
     if (mask & 1) {
       if (!only_4x4_1) {
@@ -432,7 +428,7 @@
                                         lfi->lim, lfi->hev_thr, 1);
     }
     s += 8 * count;
-    lfi += count;
+    p_lfi += count;
     mask_16x16 >>= count;
     mask_8x8 >>= count;
     mask_4x4 >>= count;
@@ -805,7 +801,7 @@
   unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
   unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
   unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
   int r, c;
 
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -834,7 +830,7 @@
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
       // Filter level can vary per MI
-      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
+      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
         continue;
 
       // Build masks based on the transform size of each block
@@ -925,7 +921,7 @@
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
   unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
   int r, c;
   int row_shift = 3 - ss_x;
   int row_mask = 0xff >> (ss_x << 2);
@@ -938,8 +934,8 @@
     // Determine the vertical edges that need filtering
     for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
       const MODE_INFO *mi = mi_8x8[c];
-      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
-        continue;
+
+      build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
     }
     if (!plane->plane_type) {
       mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index c698090..62389ea 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -46,12 +46,13 @@
 // Need to align this structure so when it is declared and
 // passed it can be loaded into vector registers.
 typedef struct {
-  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
-                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
-                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
-                  hev_thr[4][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+  loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
   uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
   uint8_t mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;
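
With the thresholds for each filter level grouped into one loop_filter_thresh,
per-block setup in build_lfi becomes a single pointer into the shared table
instead of copying mblim/lim/hev_thr separately. A trimmed-down sketch of the
layout (constants as in this header; alignment attributes omitted):

    #include <stdint.h>

    #define SKETCH_SIMD_WIDTH 16
    #define SKETCH_MAX_LOOP_FILTER 63

    typedef struct {
      uint8_t mblim[SKETCH_SIMD_WIDTH];
      uint8_t lim[SKETCH_SIMD_WIDTH];
      uint8_t hev_thr[SKETCH_SIMD_WIDTH];
    } lf_thresh_sketch;

    static lf_thresh_sketch lfthr[SKETCH_MAX_LOOP_FILTER + 1];

    /* build_lfi-style lookup: return a pointer into the shared table, valid
     * for the table's lifetime, instead of filling a per-block copy. */
    static const lf_thresh_sketch *get_lfi(int filter_level) {
      return &lfthr[filter_level];
    }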
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a823de8..ba2e9d8 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -43,7 +43,7 @@
   vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
   vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
   vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
   vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
@@ -62,7 +62,7 @@
   vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
   unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
                          [COEF_BANDS][PREV_COEF_CONTEXTS];
-  unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
                                 [SWITCHABLE_FILTERS];
   unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index a869dc0..19032bf 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -127,14 +127,14 @@
   return get_tx_probs(bsize, context, tx_probs);
 }
 
-static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
-                             TX_SIZE tx_size, struct tx_counts *tx_counts) {
-  if (bsize >= BLOCK_32X32)
-    tx_counts->p32x32[context][tx_size]++;
-  else if (bsize >= BLOCK_16X16)
-    tx_counts->p16x16[context][tx_size]++;
+static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+                                   struct tx_counts *tx_counts) {
+  if (bsize < BLOCK_16X16)
+    return tx_counts->p8x8[context];
+  else if (bsize < BLOCK_32X32)
+    return tx_counts->p16x16[context];
   else
-    tx_counts->p8x8[context][tx_size]++;
+    return tx_counts->p32x32[context];
 }
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
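
Turning update_tx_counts() into a getter lets call sites index the returned
row directly (e.g. ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size])
and reuse one lookup for both reads and updates. A hedged mock of the shape,
not the real vp9 structs or BLOCK_SIZE values:

    /* Dimensions mirror struct tx_counts; block sizes simplified to pixels. */
    struct tx_counts_sketch {
      unsigned int p8x8[2][2];
      unsigned int p16x16[2][3];
      unsigned int p32x32[2][4];
    };

    static unsigned int *get_tx_counts_sketch(int bsize_px, int ctx,
                                              struct tx_counts_sketch *c) {
      if (bsize_px < 16)
        return c->p8x8[ctx];
      else if (bsize_px < 32)
        return c->p16x16[ctx];
      else
        return c->p32x32[ctx];
    }

    /* usage: ++get_tx_counts_sketch(32, ctx, &counts)[tx_size]; */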
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 53b9003..1c96788 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -161,13 +161,12 @@
     // scaling case. It needs to be done on the scaled MV, not the pre-scaling
     // MV. Note however that it performs the subsampling aware scaling so
     // that the result is always q4.
-    const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
-                                                pd->subsampling_x,
-                                                pd->subsampling_y);
+    // The resulting MV, mv_q4, has MV_PRECISION_Q4 precision.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
 
     uint8_t *pre;
-    // mv_precision precision is MV_PRECISION_Q4.
-    const MV mv_q4 = {res_mv.row, res_mv.col };
     MV32 scaled_mv;
     int xs, ys;
 
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 2d9fbff..3f3268f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -22,10 +22,11 @@
 
 # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
 [ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
-  sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+  sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
 
 # this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && 
+  ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
 
 #
 # RECON
@@ -199,7 +200,7 @@
 specialize vp9_loop_filter_vertical_edge mmx neon
 
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
 
 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -671,10 +672,10 @@
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
 specialize vp9_subtract_block $sse2_x86inc
 
-prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b $ssse3_x86_64
 
-prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b_32x32 $ssse3_x86_64
 
 #
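
Each specialize line expands, via rtcd.sh, into runtime dispatch: the symbol
is a function pointer initialized to the C version and upgraded according to
the detected CPU flags. Roughly, with illustrative flag values rather than
the real vpx_ports definitions:

    typedef void (*lpf_fn)(unsigned char *s, int pitch,
                           const unsigned char *blimit,
                           const unsigned char *limit,
                           const unsigned char *thresh, int count);

    /* Per-ISA implementations; only the C one is always compiled in. */
    void vp9_mb_lpf_horizontal_edge_w_c(unsigned char *, int,
                                        const unsigned char *,
                                        const unsigned char *,
                                        const unsigned char *, int);
    void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *, int,
                                           const unsigned char *,
                                           const unsigned char *,
                                           const unsigned char *, int);
    void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *, int,
                                           const unsigned char *,
                                           const unsigned char *,
                                           const unsigned char *, int);

    lpf_fn vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;

    enum { SKETCH_HAS_SSE2 = 1 << 2, SKETCH_HAS_AVX2 = 1 << 7 };

    void setup_rtcd_sketch(int flags) {
      if (flags & SKETCH_HAS_SSE2)
        vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
      if (flags & SKETCH_HAS_AVX2)
        vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
    }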
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 0000000..3c5cb8f
--- /dev/null
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  /* AVX2 */
+
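+/* Layout notes: thresholds are splatted with _mm_broadcastb_epi8 (an AVX2
+ * instruction, which is why both widths live in this file). The 8-pixel
+ * variant packs the p-side row into the low half and the q-side row into the
+ * high half of each xmm register (hence the q?p? names), so one vector op
+ * filters both sides at once; the 16-pixel variant keeps full rows per
+ * register and widens them to 16-bit ymm lanes for the flat sums. */
+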
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh) {
+    __m128i mask, hev, flat, flat2;
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+    __m128i abs_p1p0;
+
+    const __m128i thresh = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _thresh[0]));
+    const __m128i limit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _limit[0]));
+    const __m128i blimit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _blimit[0]));
+
+    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+    q4p4 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+    q3p3 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+    q2p2 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+    q1p1 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+    p1q1 = _mm_shuffle_epi32(q1p1, 78);
+    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+    q0p0 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+    p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+    {
+        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+                _mm_subs_epu8(q0p0, q1p1));
+        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+        fe = _mm_set1_epi8(0xfe);
+        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+                _mm_subs_epu8(p0q0, q0p0));
+        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+                _mm_subs_epu8(p1q1, q1p1));
+        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+        hev = _mm_subs_epu8(flat, thresh);
+        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+        mask = _mm_max_epu8(abs_p1p0, mask);
+        // mask |= (abs(p1 - p0) > limit) * -1;
+        // mask |= (abs(q1 - q0) > limit) * -1;
+
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+                        _mm_subs_epu8(q1p1, q2p2)),
+                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+                        _mm_subs_epu8(q2p2, q3p3)));
+        mask = _mm_max_epu8(work, mask);
+        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+        mask = _mm_subs_epu8(mask, limit);
+        mask = _mm_cmpeq_epi8(mask, zero);
+    }
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i t1 = _mm_set1_epi16(0x1);
+        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+        __m128i qs0 = _mm_xor_si128(p0q0, t80);
+        __m128i qs1 = _mm_xor_si128(p1q1, t80);
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+        work_a = _mm_subs_epi8(qs0, qs0ps0);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        filter1 = _mm_unpacklo_epi8(zero, filter1);
+        filter1 = _mm_srai_epi16(filter1, 0xB);
+        filter2 = _mm_unpacklo_epi8(zero, filter2);
+        filter2 = _mm_srai_epi16(filter2, 0xB);
+
+        /* Filter1 >> 3 */
+        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi16(filter1, t1);
+        filt = _mm_srai_epi16(filt, 1);
+        filt = _mm_andnot_si128(
+                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+        // loopfilter done
+
+        {
+            __m128i work;
+            flat = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+                            _mm_subs_epu8(q0p0, q2p2)),
+                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+                            _mm_subs_epu8(q0p0, q3p3)));
+            flat = _mm_max_epu8(abs_p1p0, flat);
+            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);
+            flat = _mm_and_si128(flat, mask);
+
+            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+            q5p5 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+                            (__m64 *) (s + 5 * p)));
+
+            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+            q6p6 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+                            (__m64 *) (s + 6 * p)));
+
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+                            _mm_subs_epu8(q0p0, q4p4)),
+                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+                            _mm_subs_epu8(q0p0, q5p5)));
+
+            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+            q7p7 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+                            (__m64 *) (s + 7 * p)));
+
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+                            _mm_subs_epu8(q0p0, q6p6)),
+                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+                            _mm_subs_epu8(q0p0, q7p7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m128i eight = _mm_set1_epi16(8);
+            const __m128i four = _mm_set1_epi16(4);
+            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+            __m128i pixelFilter_p, pixelFilter_q;
+            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
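+            /* Widen both packed halves to 16 bits, then build the 15-tap wide
+             * filter incrementally: pixelFilter_p starts as a shared 14-tap
+             * sum plus the rounding 8, each res_p / res_q folds in its end
+             * taps before the >> 4, and the sum slides as outputs move
+             * outward. pixetFilter_p2p1p0 plays the same role for the 7-tap
+             * flat filter with (+ 4) >> 3. */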
+            p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+            q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+            q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+                    _mm_add_epi16(p4_16, p3_16));
+            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+                    _mm_add_epi16(q4_16, q3_16));
+
+            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+                    _mm_add_epi16(p2_16, p1_16));
+            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+                    _mm_add_epi16(q2_16, q1_16));
+            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+            pixelFilter_p = _mm_add_epi16(eight,
+                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+            pixetFilter_p2p1p0 = _mm_add_epi16(four,
+                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+                    4);
+            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(p3_16, p0_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(q3_16, q0_16)), 3);
+
+            flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(p7_16, p7_16);
+            sum_q7 = _mm_add_epi16(q7_16, q7_16);
+            sum_p3 = _mm_add_epi16(p3_16, p3_16);
+            sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+                    4);
+            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(sum_p3, p1_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_q2q1q0,
+                            _mm_add_epi16(sum_q3, q1_16)), 3);
+            flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+            sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+                    4);
+            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(sum_p3, p2_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_q2q1q0,
+                            _mm_add_epi16(sum_q3, q2_16)), 3);
+            flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+                    4);
+            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+                    4);
+            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+                    4);
+            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+                    4);
+            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+        }
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+        flat = _mm_shuffle_epi32(flat, 68);
+        flat2 = _mm_shuffle_epi32(flat2, 68);
+
+        q2p2 = _mm_andnot_si128(flat, q2p2);
+        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+        q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+        q6p6 = _mm_andnot_si128(flat2, q6p6);
+        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+        q5p5 = _mm_andnot_si128(flat2, q5p5);
+        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+        q4p4 = _mm_andnot_si128(flat2, q4p4);
+        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+        q3p3 = _mm_andnot_si128(flat2, q3p3);
+        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+        q2p2 = _mm_andnot_si128(flat2, q2p2);
+        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+        q1p1 = _mm_andnot_si128(flat2, q1p1);
+        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+        q0p0 = _mm_andnot_si128(flat2, q0p0);
+        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+    }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh) {
+    __m128i mask, hev, flat, flat2;
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i p7, p6, p5;
+    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+    __m128i q5, q6, q7;
+
+    const __m128i thresh = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _thresh[0]));
+    const __m128i limit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _limit[0]));
+    const __m128i blimit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _blimit[0]));
+
+    p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+    p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+    p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+    p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+    p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+    q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+    q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+    q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+    q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+    q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+    {
+        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                _mm_subs_epu8(p0, p1));
+        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                _mm_subs_epu8(q0, q1));
+        const __m128i fe = _mm_set1_epi8(0xfe);
+        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                _mm_subs_epu8(q0, p0));
+        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                _mm_subs_epu8(q1, p1));
+        __m128i work;
+        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+        hev = _mm_subs_epu8(flat, thresh);
+        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+        mask = _mm_max_epu8(flat, mask);
+        // mask |= (abs(p1 - p0) > limit) * -1;
+        // mask |= (abs(q1 - q0) > limit) * -1;
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+        mask = _mm_max_epu8(work, mask);
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+        mask = _mm_max_epu8(work, mask);
+        mask = _mm_subs_epu8(mask, limit);
+        mask = _mm_cmpeq_epi8(mask, zero);
+    }
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i te0 = _mm_set1_epi8(0xe0);
+        const __m128i t1f = _mm_set1_epi8(0x1f);
+        const __m128i t1 = _mm_set1_epi8(0x1);
+        const __m128i t7f = _mm_set1_epi8(0x7f);
+
+        __m128i ps1 = _mm_xor_si128(p1, t80);
+        __m128i ps0 = _mm_xor_si128(p0, t80);
+        __m128i qs0 = _mm_xor_si128(q0, t80);
+        __m128i qs1 = _mm_xor_si128(q1, t80);
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+                flat_q2;
+
+        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+        work_a = _mm_subs_epi8(qs0, ps0);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        /* Filter1 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter1);
+        filter1 = _mm_srli_epi16(filter1, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter1 = _mm_and_si128(filter1, t1f);
+        filter1 = _mm_or_si128(filter1, work_a);
+        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+        /* Filter2 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter2);
+        filter2 = _mm_srli_epi16(filter2, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter2 = _mm_and_si128(filter2, t1f);
+        filter2 = _mm_or_si128(filter2, work_a);
+        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi8(filter1, t1);
+        work_a = _mm_cmpgt_epi8(zero, filt);
+        filt = _mm_srli_epi16(filt, 1);
+        work_a = _mm_and_si128(work_a, t80);
+        filt = _mm_and_si128(filt, t7f);
+        filt = _mm_or_si128(filt, work_a);
+        filt = _mm_andnot_si128(hev, filt);
+        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+        // loopfilter done
+
+        {
+            __m128i work;
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);
+            flat = _mm_and_si128(flat, mask);
+
+            p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+            q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+            q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+
+            p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+            q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m256i eight = _mm256_set1_epi16(8);
+            const __m256i four = _mm256_set1_epi16(4);
+            __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+                    q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+                    p256_0, q256_0;
+            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+                    res_q;
+
+            p256_7 = _mm256_cvtepu8_epi16(p7);
+            p256_6 = _mm256_cvtepu8_epi16(p6);
+            p256_5 = _mm256_cvtepu8_epi16(p5);
+            p256_4 = _mm256_cvtepu8_epi16(p4);
+            p256_3 = _mm256_cvtepu8_epi16(p3);
+            p256_2 = _mm256_cvtepu8_epi16(p2);
+            p256_1 = _mm256_cvtepu8_epi16(p1);
+            p256_0 = _mm256_cvtepu8_epi16(p0);
+            q256_0 = _mm256_cvtepu8_epi16(q0);
+            q256_1 = _mm256_cvtepu8_epi16(q1);
+            q256_2 = _mm256_cvtepu8_epi16(q2);
+            q256_3 = _mm256_cvtepu8_epi16(q3);
+            q256_4 = _mm256_cvtepu8_epi16(q4);
+            q256_5 = _mm256_cvtepu8_epi16(q5);
+            q256_6 = _mm256_cvtepu8_epi16(q6);
+            q256_7 = _mm256_cvtepu8_epi16(q7);
+
+            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+                    _mm256_add_epi16(p256_4, p256_3));
+            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+                    _mm256_add_epi16(q256_4, q256_3));
+
+            pixelFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+                    _mm256_add_epi16(p256_2, p256_1));
+            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixelFilter_p2p1p0);
+
+            pixelFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+                    _mm256_add_epi16(q256_2, q256_1));
+            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixelFilter_q2q1q0);
+
+            pixelFilter_p = _mm256_add_epi16(eight,
+                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+            pixelFilter_p2p1p0 = _mm256_add_epi16(four,
+                    _mm256_add_epi16(pixelFilter_p2p1p0, pixelFilter_q2q1q0));
+
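+            // _mm256_packus_epi16(x, x) packs each 128-bit lane on its
+            // own, leaving the bytes in the low half of each lane. The
+            // permute with control 168 (0xA8, qwords 0 and 2) gathers both
+            // halves into the low 128 bits before the cast to __m128i.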
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(p256_7, p256_0)), 4);
+
+            flat2_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(q256_7, q256_0)), 4);
+
+            flat2_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p2p1p0,
+                            _mm256_add_epi16(p256_3, p256_0)), 3);
+
+            flat_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p2p1p0,
+                            _mm256_add_epi16(q256_3, q256_0)), 3);
+
+            flat_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+            flat2_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+            flat2_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixelFilter_q2q1q0 = _mm256_sub_epi16(pixelFilter_p2p1p0, p256_2);
+
+            pixelFilter_p2p1p0 = _mm256_sub_epi16(pixelFilter_p2p1p0, q256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+            flat_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+            flat_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+            flat2_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+            flat2_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixelFilter_p2p1p0 = _mm256_sub_epi16(pixelFilter_p2p1p0, q256_1);
+
+            pixelFilter_q2q1q0 = _mm256_sub_epi16(pixelFilter_q2q1q0, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+            flat_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+            flat_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+            flat2_p3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+            flat2_q3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+            flat2_p4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+            flat2_q4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+            flat2_p5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+            flat2_q5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+            flat2_p6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+            flat2_q6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+        }
+
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
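+        // Branchless select: out = (mask & filtered) | (~mask & original).
+        // 'flat' picks the 7-tap results for p2..q2, then 'flat2' picks
+        // the 15-tap results for p6..q6 as the rows are written back.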
+
+        p2 = _mm_andnot_si128(flat, p2);
+        flat_p2 = _mm_and_si128(flat, flat_p2);
+        p2 = _mm_or_si128(flat_p2, p2);
+
+        p1 = _mm_andnot_si128(flat, ps1);
+        flat_p1 = _mm_and_si128(flat, flat_p1);
+        p1 = _mm_or_si128(flat_p1, p1);
+
+        p0 = _mm_andnot_si128(flat, ps0);
+        flat_p0 = _mm_and_si128(flat, flat_p0);
+        p0 = _mm_or_si128(flat_p0, p0);
+
+        q0 = _mm_andnot_si128(flat, qs0);
+        flat_q0 = _mm_and_si128(flat, flat_q0);
+        q0 = _mm_or_si128(flat_q0, q0);
+
+        q1 = _mm_andnot_si128(flat, qs1);
+        flat_q1 = _mm_and_si128(flat, flat_q1);
+        q1 = _mm_or_si128(flat_q1, q1);
+
+        q2 = _mm_andnot_si128(flat, q2);
+        flat_q2 = _mm_and_si128(flat, flat_q2);
+        q2 = _mm_or_si128(flat_q2, q2);
+
+        p6 = _mm_andnot_si128(flat2, p6);
+        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+        p6 = _mm_or_si128(flat2_p6, p6);
+        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+        p5 = _mm_andnot_si128(flat2, p5);
+        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+        p5 = _mm_or_si128(flat2_p5, p5);
+        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+        p4 = _mm_andnot_si128(flat2, p4);
+        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+        p4 = _mm_or_si128(flat2_p4, p4);
+        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+        p3 = _mm_andnot_si128(flat2, p3);
+        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+        p3 = _mm_or_si128(flat2_p3, p3);
+        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+        p2 = _mm_andnot_si128(flat2, p2);
+        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+        p2 = _mm_or_si128(flat2_p2, p2);
+        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+        p1 = _mm_andnot_si128(flat2, p1);
+        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+        p1 = _mm_or_si128(flat2_p1, p1);
+        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+        p0 = _mm_andnot_si128(flat2, p0);
+        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+        p0 = _mm_or_si128(flat2_p0, p0);
+        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+        q0 = _mm_andnot_si128(flat2, q0);
+        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+        q0 = _mm_or_si128(flat2_q0, q0);
+        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+        q1 = _mm_andnot_si128(flat2, q1);
+        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+        q1 = _mm_or_si128(flat2_q1, q1);
+        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+        q2 = _mm_andnot_si128(flat2, q2);
+        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+        q2 = _mm_or_si128(flat2_q2, q2);
+        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+        q3 = _mm_andnot_si128(flat2, q3);
+        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+        q3 = _mm_or_si128(flat2_q3, q3);
+        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+        q4 = _mm_andnot_si128(flat2, q4);
+        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+        q4 = _mm_or_si128(flat2_q4, q4);
+        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+        q5 = _mm_andnot_si128(flat2, q5);
+        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+        q5 = _mm_or_si128(flat2_q5, q5);
+        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+        q6 = _mm_andnot_si128(flat2, q6);
+        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+        q6 = _mm_or_si128(flat2_q6, q6);
+        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+    }
+}
+
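+// Filters a horizontal edge of 8 (count == 1) or 16 pixels.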
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh, int count) {
+    if (count == 1)
+        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+    else
+        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 6bc51e8..475a299 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -72,7 +72,7 @@
   }
 
   if (!cm->frame_parallel_decoding_mode)
-    update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
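+    // get_tx_counts() returns the counts row for this size/context pair.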
+    ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
   return tx_size;
 }
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 779368d..e3a2b77 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -37,6 +37,12 @@
 #include "vp9/decoder/vp9_thread.h"
 #include "vp9/decoder/vp9_treereader.h"
 
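+// Per-worker state for multi-threaded tile decoding: each tile worker gets
+// its own bit reader and macroblock descriptor.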
+typedef struct TileWorkerData {
+  VP9_COMMON *cm;
+  vp9_reader bit_reader;
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+} TileWorkerData;
+
 static int read_be32(const uint8_t *p) {
   return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
 }
@@ -103,7 +109,7 @@
 
 static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
-  for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
@@ -238,9 +244,8 @@
                               aligned_mi_cols));
 }
 
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                         TX_SIZE tx_size, void *arg) {
-  MACROBLOCKD* const xd = arg;
+static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
+                                    BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
   const int stride = pd->dst.stride;
@@ -276,7 +281,7 @@
     }
 
     if (eob == 1) {
-      *((int32_t *)qcoeff) = 0;
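+      // Clears the same 4 bytes (two coefficients) as the old int32_t
+      // store, but without the type-punned pointer cast.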
+      vpx_memset(qcoeff, 0, 2 * sizeof(qcoeff[0]));
     } else {
       if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0]));
@@ -286,9 +291,19 @@
   }
 }
 
-static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                               TX_SIZE tx_size, void *arg) {
-  MACROBLOCKD* const xd = arg;
+struct intra_args {
+  VP9_COMMON *cm;
+  MACROBLOCKD *xd;
+  vp9_reader *r;
+};
+
+static void predict_and_reconstruct_intra_block(int plane, int block,
+                                                BLOCK_SIZE plane_bsize,
+                                                TX_SIZE tx_size, void *arg) {
+  struct intra_args *const args = arg;
+  VP9_COMMON *const cm = args->cm;
+  MACROBLOCKD *const xd = args->xd;
+
   struct macroblockd_plane *const pd = &xd->plane[plane];
   MODE_INFO *const mi = xd->mi_8x8[0];
   const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
@@ -307,25 +322,30 @@
                           b_width_log2(plane_bsize), tx_size, mode,
                           dst, pd->dst.stride, dst, pd->dst.stride);
 
-  if (!mi->mbmi.skip_coeff)
-    decode_block(plane, block, plane_bsize, tx_size, arg);
+  if (!mi->mbmi.skip_coeff) {
+    vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+                            args->r);
+    inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+  }
 }
 
-static int decode_tokens(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                         BLOCK_SIZE bsize, vp9_reader *r) {
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+struct inter_args {
+  VP9_COMMON *cm;
+  MACROBLOCKD *xd;
+  vp9_reader *r;
+  int *eobtotal;
+};
 
-  if (mbmi->skip_coeff) {
-    reset_skip_context(xd, bsize);
-    return -1;
-  } else {
-    if (cm->seg.enabled)
-      setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
-                                                  cm->base_qindex));
+static void reconstruct_inter_block(int plane, int block,
+                                    BLOCK_SIZE plane_bsize,
+                                    TX_SIZE tx_size, void *arg) {
+  struct inter_args *args = arg;
+  VP9_COMMON *const cm = args->cm;
+  MACROBLOCKD *const xd = args->xd;
 
-    // TODO(dkovalev) if (!vp9_reader_has_error(r))
-    return vp9_decode_tokens(cm, xd, &cm->seg, r, bsize);
-  }
+  *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
+                                             plane_bsize, tx_size, args->r);
+  inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
 }
 
 static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -376,14 +396,9 @@
 static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE bsize, int index) {
+                           vp9_reader *r, BLOCK_SIZE bsize) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
-  int eobtotal;
-
-  if (less8x8)
-    if (index > 0)
-      return;
 
   set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
   vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
@@ -393,32 +408,41 @@
 
   // Has to be called after set_offsets
   mbmi = &xd->mi_8x8[0]->mbmi;
-  eobtotal = decode_tokens(cm, xd, bsize, r);
+
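+  // When skip_coeff is set there are no tokens to read; just reset the
+  // skip entropy context. Otherwise make sure the dequantizer matches the
+  // block's segment before any tokens are decoded.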
+  if (mbmi->skip_coeff) {
+    reset_skip_context(xd, bsize);
+  } else {
+    if (cm->seg.enabled)
+      setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+                                                  cm->base_qindex));
+  }
 
   if (!is_inter_block(mbmi)) {
-    // Intra reconstruction
-    foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+    struct intra_args arg = { cm, xd, r };
+    foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
+                              &arg);
   } else {
-    // Inter reconstruction
-    const int decode_blocks = (eobtotal > 0);
-
-    if (!less8x8) {
-      assert(mbmi->sb_type == bsize);
-      if (eobtotal == 0)
-        mbmi->skip_coeff = 1;  // skip loopfilter
-    }
-
+    // Setup
     set_ref(cm, xd, 0, mi_row, mi_col);
     if (has_second_ref(mbmi))
       set_ref(cm, xd, 1, mi_row, mi_col);
 
     xd->subpix.filter_x = xd->subpix.filter_y =
         vp9_get_filter_kernel(mbmi->interp_filter);
+
+    // Prediction
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
-    if (decode_blocks)
-      foreach_transformed_block(xd, bsize, decode_block, xd);
+    // Reconstruction
+    if (!mbmi->skip_coeff) {
+      int eobtotal = 0;
+      struct inter_args arg = { cm, xd, r, &eobtotal };
+      foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+      if (!less8x8 && eobtotal == 0)
+        mbmi->skip_coeff = 1;  // skip loopfilter
+    }
   }
+
   xd->corrupted |= vp9_reader_has_error(r);
 }
 
@@ -442,54 +466,50 @@
 static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE bsize, int index) {
+                            vp9_reader* r, BLOCK_SIZE bsize) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
-  PARTITION_TYPE partition = PARTITION_NONE;
+  PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+  int ctx;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (bsize < BLOCK_8X8) {
-    if (index > 0)
-      return;
-  } else {
-    const int ctx = partition_plane_context(xd->above_seg_context,
-                                            xd->left_seg_context,
-                                            mi_row, mi_col, bsize);
-    partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
-                               cm->fc.partition_prob[cm->frame_type][ctx], r);
+  ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
+                                mi_row, mi_col, bsize);
+  partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
+                             cm->fc.partition_prob[cm->frame_type][ctx], r);
 
-    if (!cm->frame_parallel_decoding_mode)
-      ++cm->counts.partition[ctx][partition];
-  }
+  if (!cm->frame_parallel_decoding_mode)
+    ++cm->counts.partition[ctx][partition];
 
   subsize = get_subsize(bsize, partition);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
-      break;
-    case PARTITION_HORZ:
-      decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
-      if (mi_row + hbs < cm->mi_rows)
-        decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, 1);
-      break;
-    case PARTITION_VERT:
-      decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
-      if (mi_col + hbs < cm->mi_cols)
-        decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, 1);
-      break;
-    case PARTITION_SPLIT: {
-      int n;
-      for (n = 0; n < 4; n++) {
-        const int j = n >> 1, i = n & 1;
-        decode_modes_sb(cm, xd, tile, mi_row + j * hbs, mi_col + i * hbs,
-                        r, subsize, n);
-      }
-    } break;
-    default:
-      assert(!"Invalid partition type");
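+  // Sub-8x8 sizes are not split any further; decode_modes_b() handles the
+  // sub-blocks itself, whatever the partition type.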
+  if (subsize < BLOCK_8X8) {
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        break;
+      case PARTITION_HORZ:
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        if (mi_row + hbs < cm->mi_rows)
+          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        break;
+      case PARTITION_VERT:
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        if (mi_col + hbs < cm->mi_cols)
+          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        break;
+      case PARTITION_SPLIT:
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        break;
+      default:
+        assert(!"Invalid partition type");
+    }
   }
 
   // update partition context
@@ -774,7 +794,7 @@
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, 0);
+      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
 
     if (pbi->do_loopfilter_inline) {
       const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -823,6 +843,27 @@
     cm->log2_tile_rows += vp9_rb_read_bit(rb);
 }
 
+// Reads the size of the next tile. Unless 'is_last' is set, the size is
+// read from a 4-byte big-endian field and '*data' is advanced past it;
+// the last tile spans the remainder of the buffer.
+static size_t get_tile(const uint8_t *const data_end,
+                       int is_last,
+                       struct vpx_internal_error_info *error_info,
+                       const uint8_t **data) {
+  size_t size;
+
+  if (!is_last) {
+    if (!read_is_valid(*data, 4, data_end))
+      vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+          "Truncated packet or corrupt tile length");
+
+    size = read_be32(*data);
+    *data += 4;
+  } else {
+    size = data_end - *data;
+  }
+  return size;
+}
+
 static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
   vp9_reader residual_bc;
 
@@ -848,20 +889,15 @@
     const uint8_t *data_ptr2[4][1 << 6];
     vp9_reader bc_bak = {0};
 
-    // pre-initialize the offsets, we're going to read in inverse order
+      // pre-initialize the offsets; we're going to decode in inverse order
     data_ptr2[0][0] = data;
     for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      if (tile_row) {
-        const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]);
-        data_ptr2[tile_row - 1][tile_cols - 1] += 4;
-        data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size;
-      }
-
-      for (tile_col = 1; tile_col < tile_cols; tile_col++) {
-        const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
-        data_ptr2[tile_row][tile_col - 1] += 4;
-        data_ptr2[tile_row][tile_col] =
-            data_ptr2[tile_row][tile_col - 1] + size;
+      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+        const int last_tile =
+            tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+        const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+        data_ptr2[tile_row][tile_col] = data;
+        data += size;
       }
     }
 
@@ -881,27 +917,15 @@
     }
     residual_bc = bc_bak;
   } else {
-    int has_more;
-
     for (tile_row = 0; tile_row < tile_rows; tile_row++) {
       for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+        const int last_tile =
+            tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+        const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
         TileInfo tile;
-        size_t size;
 
         vp9_tile_init(&tile, cm, tile_row, tile_col);
 
-        has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
-        if (has_more) {
-          if (!read_is_valid(data, 4, data_end))
-            vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt tile length");
-
-          size = read_be32(data);
-          data += 4;
-        } else {
-          size = data_end - data;
-        }
-
         setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
         setup_tile_context(pbi, xd, tile_col);
         decode_tile(pbi, &tile, &residual_bc);
@@ -913,6 +937,107 @@
   return vp9_reader_find_end(&residual_bc);
 }
 
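+// VP9Worker hook that decodes one tile: 'arg1' is the TileWorkerData and
+// 'arg2' the TileInfo. Returns nonzero on success; a zero return shows up
+// through vp9_worker_sync() as corruption.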
+static int tile_worker_hook(void *arg1, void *arg2) {
+  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+  const TileInfo *const tile = (TileInfo*)arg2;
+  int mi_row, mi_col;
+
+  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+       mi_row += MI_BLOCK_SIZE) {
+    vp9_zero(tile_data->xd.left_context);
+    vp9_zero(tile_data->xd.left_seg_context);
+    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+         mi_col += MI_BLOCK_SIZE)
+      decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
+                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+  }
+  return !tile_data->xd.corrupted;
+}
+
+static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
+  VP9_COMMON *const cm = &pbi->common;
+  const uint8_t *const data_end = pbi->source + pbi->source_sz;
+  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+  int tile_col = 0;
+
+  assert(tile_rows == 1);
+  (void)tile_rows;
+
+  if (num_workers > pbi->num_tile_workers) {
+    int i;
+    CHECK_MEM_ERROR(cm, pbi->tile_workers,
+                    vpx_realloc(pbi->tile_workers,
+                                num_workers * sizeof(*pbi->tile_workers)));
+    for (i = pbi->num_tile_workers; i < num_workers; ++i) {
+      VP9Worker *const worker = &pbi->tile_workers[i];
+      ++pbi->num_tile_workers;
+
+      vp9_worker_init(worker);
+      worker->hook = (VP9WorkerHook)tile_worker_hook;
+      CHECK_MEM_ERROR(cm, worker->data1,
+                      vpx_memalign(32, sizeof(TileWorkerData)));
+      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
+      if (i < num_workers - 1 && !vp9_worker_reset(worker)) {
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile decoder thread creation failed");
+      }
+    }
+  }
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  vpx_memset(pbi->above_context[0], 0,
+             sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+             2 * aligned_mi_cols);
+  vpx_memset(pbi->above_seg_context, 0,
+             sizeof(*pbi->above_seg_context) * aligned_mi_cols);
+
+  while (tile_col < tile_cols) {
+    int i;
+    for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+      VP9Worker *const worker = &pbi->tile_workers[i];
+      TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+      TileInfo *const tile = (TileInfo*)worker->data2;
+      const size_t size =
+          get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data);
+
+      tile_data->cm = cm;
+      tile_data->xd = pbi->mb;
+      tile_data->xd.corrupted = 0;
+      vp9_tile_init(tile, tile_data->cm, 0, tile_col);
+
+      setup_token_decoder(data, data_end, size, &cm->error,
+                          &tile_data->bit_reader);
+      setup_tile_context(pbi, &tile_data->xd, tile_col);
+
+      worker->had_error = 0;
+      if (i == num_workers - 1 || tile_col == tile_cols - 1) {
+        vp9_worker_execute(worker);
+      } else {
+        vp9_worker_launch(worker);
+      }
+
+      data += size;
+      ++tile_col;
+    }
+
+    for (; i > 0; --i) {
+      VP9Worker *const worker = &pbi->tile_workers[i - 1];
+      pbi->mb.corrupted |= !vp9_worker_sync(worker);
+    }
+  }
+
+  {
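+    // Tiles were handed out round-robin, so the last tile landed on worker
+    // (tile_cols - 1) % num_workers; its bit reader marks the end of the
+    // compressed data.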
+    const int final_worker = (tile_cols + num_workers - 1) % num_workers;
+    TileWorkerData *const tile_data =
+        (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+    return vp9_reader_find_end(&tile_data->bit_reader);
+  }
+}
+
 static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
       vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
@@ -1041,7 +1166,11 @@
   setup_tile_info(cm, rb);
   sz = vp9_rb_read_literal(rb, 16);
 
-  return sz > 0 ? sz : -1;
+  if (sz == 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid header size");
+
+  return sz;
 }
 
 static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
@@ -1148,39 +1277,31 @@
   MACROBLOCKD *const xd = &pbi->mb;
 
   const uint8_t *data = pbi->source;
-  const uint8_t *data_end = pbi->source + pbi->source_sz;
+  const uint8_t *const data_end = pbi->source + pbi->source_sz;
 
-  struct vp9_read_bit_buffer rb = { data, data_end, 0,
-                                    cm, error_handler };
+  struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler };
   const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
   const int keyframe = cm->frame_type == KEY_FRAME;
-  YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
+  const int tile_rows = 1 << cm->log2_tile_rows;
   const int tile_cols = 1 << cm->log2_tile_cols;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
 
   if (!first_partition_size) {
-    if (!keyframe) {
       // showing a frame directly
       *p_data_end = data + 1;
       return 0;
-    } else {
-      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid key frame");
-      return -1;
-    }
   }
-  data += vp9_rb_bytes_read(&rb);
-  xd->corrupted = 0;
-  new_fb->corrupted = 0;
-  pbi->do_loopfilter_inline =
-      (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
 
   if (!pbi->decoded_key_frame && !keyframe)
     return -1;
 
+  data += vp9_rb_bytes_read(&rb);
   if (!read_is_valid(data, first_partition_size, data_end))
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
+  pbi->do_loopfilter_inline =
+      (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
   if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) {
     CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData)));
     pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
@@ -1190,28 +1311,31 @@
     }
   }
 
-  setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
+  alloc_tile_storage(pbi, tile_cols);
 
   xd->mi_8x8 = cm->mi_grid_visible;
   xd->mode_info_stride = cm->mode_info_stride;
+  set_prev_mi(cm);
 
-  alloc_tile_storage(pbi, tile_cols);
-
-  cm->fc = cm->frame_contexts[cm->frame_context_idx];
-
-  vp9_zero(cm->counts);
-
-  new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
-
+  setup_plane_dequants(cm, xd, cm->base_qindex);
   setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
 
-  // clear out the coeff buffer
+  cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  vp9_zero(cm->counts);
   for (i = 0; i < MAX_MB_PLANE; ++i)
     vp9_zero(xd->plane[i].qcoeff);
 
-  set_prev_mi(cm);
+  xd->corrupted = 0;
+  new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
 
-  *p_data_end = decode_tiles(pbi, data + first_partition_size);
+  // TODO(jzern): remove frame_parallel_decoding_mode restriction for
+  // single-frame tile decoding.
+  if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
+      cm->frame_parallel_decoding_mode) {
+    *p_data_end = decode_tiles_mt(pbi, data + first_partition_size);
+  } else {
+    *p_data_end = decode_tiles(pbi, data + first_partition_size);
+  }
 
   cm->last_width = cm->width;
   cm->last_height = cm->height;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 0d0f0df..6ecce28 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -210,45 +210,25 @@
   return c;
 }
 
-struct decode_block_args {
-  VP9_COMMON *cm;
-  MACROBLOCKD *xd;
-  struct segmentation *seg;
-  vp9_reader *r;
-  int *eobtotal;
-};
-
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                         TX_SIZE tx_size, void *argv) {
-  const struct decode_block_args* const arg = argv;
-
-  // find the maximum eob for this transform size, adjusted by segment
-  MACROBLOCKD *xd = arg->xd;
-  const struct segmentation *seg = arg->seg;
-  struct macroblockd_plane* pd = &xd->plane[plane];
-  const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
-  const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, vp9_reader *r) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
+                                 tx_size);
   int aoff, loff, eob, pt;
-
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
   pt = get_entropy_context(tx_size, pd->above_context + aoff,
                                     pd->left_context + loff);
 
-  eob = decode_coefs(arg->cm, xd, arg->r, block,
+  eob = decode_coefs(cm, xd, r, block,
                      pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
                      tx_size, pd->dequant, pt);
 
   set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
 
   pd->eobs[block] = eob;
-  *arg->eobtotal += eob;
+  return eob;
 }
 
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
-                      struct segmentation *seg,
-                      vp9_reader *r, BLOCK_SIZE bsize) {
-  int eobtotal = 0;
-  struct decode_block_args args = {cm, xd, seg, r, &eobtotal};
-  foreach_transformed_block(xd, bsize, decode_block, &args);
-  return eobtotal;
-}
+
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 0fb4c3c..94dd8e4 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -15,8 +15,8 @@
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
-                      struct segmentation *seg,
-                      vp9_reader *r, BLOCK_SIZE bsize);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, vp9_reader *r);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index ada73cc..5f970a3 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -147,6 +147,7 @@
 }
 
 void vp9_remove_decompressor(VP9D_PTR ptr) {
+  int i;
   VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
 
   if (!pbi)
@@ -155,6 +156,13 @@
   vp9_remove_common(&pbi->common);
   vp9_worker_end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
+  for (i = 0; i < pbi->num_tile_workers; ++i) {
+    VP9Worker *const worker = &pbi->tile_workers[i];
+    vp9_worker_end(worker);
+    vpx_free(worker->data1);
+    vpx_free(worker->data2);
+  }
+  vpx_free(pbi->tile_workers);
   vpx_free(pbi->mi_streams);
   vpx_free(pbi->above_context[0]);
   vpx_free(pbi->above_seg_context);
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 7739952..83ea967 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -40,6 +40,9 @@
   int do_loopfilter_inline;  // apply loopfilter to available rows immediately
   VP9Worker lf_worker;
 
+  VP9Worker *tile_workers;
+  int num_tile_workers;
+
   /* Each tile column has its own MODE_INFO stream. This array indexes them by
      tile column index. */
   MODE_INFO **mi_streams;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c677907..a996e0e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -53,8 +53,7 @@
 int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
 int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
 int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
-                               [SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
 
 void init_tx_count_stats() {
   vp9_zero(tx_count_32x32p_stats);
@@ -87,10 +86,9 @@
 
 static void update_switchable_interp_stats(VP9_COMMON *cm) {
   int i, j;
-  for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
-    for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    for (j = 0; j < SWITCHABLE_FILTERS; ++j)
       switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
-    }
 }
 
 void write_tx_count_stats() {
@@ -140,9 +138,9 @@
   fclose(fp);
 
   printf(
-      "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+      "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
       "[SWITCHABLE_FILTERS] = {\n");
-  for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
     printf("  { ");
     for (j = 0; j < SWITCHABLE_FILTERS; j++) {
       printf("%"PRId64", ", switchable_interp_stats[i][j]);
@@ -236,17 +234,16 @@
 static void update_switchable_interp_probs(VP9_COMP *const cpi,
                                            vp9_writer* const bc) {
   VP9_COMMON *const cm = &cpi->common;
-  unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
-                        [SWITCHABLE_FILTERS - 1][2];
-  vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
+  unsigned int branch_ct[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1][2];
+  vp9_prob new_prob[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1];
   int i, j;
-  for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     vp9_tree_probs_from_distribution(
         vp9_switchable_interp_tree,
         new_prob[j], branch_ct[j],
         cm->counts.switchable_interp[j], 0);
   }
-  for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
       vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
                                 branch_ct[j][i]);
@@ -1142,7 +1139,7 @@
     int i, j, c = 0;
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
       count[i] = 0;
-      for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
+      for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
         count[i] += cm->counts.switchable_interp[j][i];
       c += (count[i] > 0);
     }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index db2564b..583c6c8 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -42,7 +42,7 @@
   int comp_pred_diff;
   int single_pred_diff;
   int64_t tx_rd_diff[TX_MODES];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
   // motion vector cache for adaptive motion search control in partition
   // search loop
@@ -118,8 +118,7 @@
   unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
-                             [SWITCHABLE_FILTERS];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
 
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5ff59a8..44ade18 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -465,7 +465,7 @@
     cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
     cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
 
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
   }
 }
@@ -2279,7 +2279,7 @@
       cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
     }
 
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
       cpi->rd_filter_threshes[frame_type][i] =
           (cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
@@ -2493,7 +2493,7 @@
             (mbmi->skip_coeff ||
              vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
       const uint8_t context = vp9_get_pred_context_tx_size(xd);
-      update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
+      ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 91546e8..e52e8ec 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -418,6 +418,7 @@
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx *const ctx = args->ctx;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
                                                        block);
@@ -429,14 +430,18 @@
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+    int x, y;
     pd->eobs[block] = 0;
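+    // The block is forced to all-zero coefficients, so also clear the
+    // entropy contexts it would otherwise have updated for its neighbours.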
+    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+    ctx->ta[plane][x] = 0;
+    ctx->tl[plane][y] = 0;
     return;
   }
 
   vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
 
   if (x->optimize)
-    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
 
   if (x->skip_encode || pd->eobs[block] == 0)
     return;
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index b867d8b..7eb6592 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -36,7 +36,7 @@
                   vp9_kf_uv_mode_prob[INTRA_MODES - 1],
                   vp9_intra_mode_tree);
 
-  for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
                     cm->fc.switchable_interp_prob[i],
                     vp9_switchable_interp_tree);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index ad214c7..b664f1e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -2866,7 +2866,7 @@
       cpi->active_best_quality = inter_minq[q];
       // 1-pass: for now, use the average Q for the active_best, if it's lower
       // than active_worst.
-      if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality))
+      if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
         cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
 #endif
 
@@ -2902,7 +2902,14 @@
   if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
     *top_index =
       (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+    // If this is the first (key) frame in 1-pass mode, set active best to
+    // the user's best-allowed quality and open top_index up to the
+    // worst-allowed quality.
+    if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+      *top_index = cpi->oxcf.worst_allowed_q;
+    }
   } else if (!cpi->is_src_frame_alt_ref &&
+             (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     *top_index =
       (cpi->active_worst_quality + cpi->active_best_quality) / 2;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 20831be..0498043 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -396,9 +396,9 @@
   // FIXME(rbultje) can this overflow?
   int rd_tx_select_threshes[4][TX_MODES];
 
-  int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
-  int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
-  int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
+  int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
 
   int RDMULT;
   int RDDIV;
@@ -641,7 +641,7 @@
 
   int dummy_packing;    /* flag to indicate if packing is dummy */
 
-  unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+  unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
                                       [SWITCHABLE_FILTERS];
 
   unsigned int tx_stepdown_count[TX_SIZES];
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 7ad8d1f..fca7525 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -22,12 +22,14 @@
 extern int enc_debug;
 #endif
 
-void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
-                      int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                      int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
-                      int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      int zbin_oq_value, uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
@@ -86,14 +88,15 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
-                            int16_t *zbin_ptr, int16_t *round_ptr,
-                            int16_t *quant_ptr, int16_t *quant_shift_ptr,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                            int16_t *dequant_ptr, int zbin_oq_value,
-                            uint16_t *eob_ptr, const int16_t *scan,
-                            const int16_t *iscan) {
+                            const int16_t *dequant_ptr,
+                            int zbin_oq_value, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
   int i, rc, eob;
   int zbins[2], nzbins[2];
   int x, y, z, sz;
@@ -174,25 +177,19 @@
   return res;
 }
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
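+// Callers now pass the scan/iscan tables explicitly, so the tx_type lookup
+// can be hoisted out of per-block loops.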
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+                                const int16_t *scan, const int16_t *iscan) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int16_t *scan = get_scan_4x4(tx_type);
-  const int16_t *iscan = get_iscan_4x4(tx_type);
+  struct macroblock_plane* p = &x->plane[pb_idx.plane];
+  struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
 
-  vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
-           16, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           scan, iscan);
+  vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+           16, x->skip_block,
+           p->zbin, p->round, p->quant, p->quant_shift,
+           BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
+           BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
+           pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 459aa33..c078e1d 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -13,8 +13,9 @@
 
 #include "vp9/encoder/vp9_block.h"
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+                                const int16_t *scan, const int16_t *iscan);
+
 struct VP9_COMP;
 
 void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e4be046..f69b19b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -611,7 +611,7 @@
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
   if (plane == 0)
-    x->zcoeff_blk[tx_size][block] = rd1 > rd2;
+    x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
 
   args->this_rate += args->rate;
   args->this_dist += args->dist;
@@ -1032,10 +1032,10 @@
 
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
-  TX_TYPE tx_type = DCT_DCT;
+
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  int idx, idy, block;
+  int idx, idy;
   uint8_t best_dst[8 * 8];
 
   assert(ib < 4);
@@ -1071,8 +1071,8 @@
         const int16_t *nb;
         uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
-
-        block = ib + idy * 2 + idx;
+        const int block = ib + idy * 2 + idx;
+        TX_TYPE tx_type;
         xd->mi_8x8[0]->bmi[block].as_mode = mode;
         src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
         coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1086,13 +1086,15 @@
                            dst, dst_stride);
 
         tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
+        get_scan_nb_4x4(tx_type, &scan, &nb);
+
         if (tx_type != DCT_DCT)
           vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
         else
           x->fwd_txm4x4(src_diff, coeff, 8);
-        vp9_regular_quantize_b_4x4(x, block, tx_type, 16);
 
-        get_scan_nb_4x4(tx_type, &scan, &nb);
+        vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
+
         ratey += cost_coeffs(x, 0, block,
                              tempa + idx, templ + idy, TX_4X4, scan, nb);
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1558,7 +1560,8 @@
       coeff = BLOCK_OFFSET(p->coeff, k);
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
-      vp9_regular_quantize_b_4x4(x, k, DCT_DCT, 16);
+      vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
+                                 get_iscan_4x4(DCT_DCT));
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
       thissse += ssz;
@@ -1651,6 +1654,7 @@
   MB_PREDICTION_MODE this_mode;
   MODE_INFO *mi = x->e_mbd.mi_8x8[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
+  struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
   const int label_count = 4;
   int64_t this_segment_rd = 0;
   int label_mv_thresh;
@@ -1665,8 +1669,8 @@
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
 
-  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+  vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
 
   v_fn_ptr = &cpi->fn_ptr[bsize];
 
@@ -1744,7 +1748,7 @@
           }
         }
 
-        vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+        vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
                    sizeof(bsi->rdstat[i][mode_idx].ta));
         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
@@ -1948,6 +1952,13 @@
               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
                        sizeof(SEG_RDSTAT));
+            if (num_4x4_blocks_wide > 1)
+              bsi->rdstat[i + 1][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
+            if (num_4x4_blocks_high > 1)
+              bsi->rdstat[i + 2][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
               mode_selected = this_mode;
               best_rd = bsi->rdstat[i][mode_idx].brdcost;
@@ -1968,7 +1979,11 @@
           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                             bsi->rdstat[i][mode_idx].brate, 0);
           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
-          bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+          bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
         }
 
         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
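
The eobs propagation added in the two hunks above covers sub-8x8 partitions that span more than one 4x4 block; both the cached (ref_bsi) and freshly computed paths must record the extra end-of-block values, or later skip decisions would read stale ones. The block indexing follows the raster order of the 4x4 sub-blocks within the 8x8:

    /* 4x4 sub-block raster order within an 8x8 (stride 2):
     *   +---+---+
     *   | 0 | 1 |
     *   +---+---+
     *   | 2 | 3 |
     *   +---+---+
     * A partition two blocks wide starting at i also covers i + 1;
     * one two blocks high also covers i + 2. */
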
@@ -2204,7 +2219,7 @@
                          int_mv *second_ref_mv,
                          int64_t comp_pred_diff[NB_PREDICTION_TYPES],
                          int64_t tx_size_diff[TX_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
+                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
@@ -2222,7 +2237,7 @@
 
   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
-             sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
+             sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
 static void setup_pred_block(const MACROBLOCKD *xd,
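
The SWITCHABLE_FILTERS + 1 array sizes and the matching <= loop bounds are replaced throughout by SWITCHABLE_FILTER_CONTEXTS. These arrays hold one RD entry per concrete switchable filter plus one slot for the SWITCHABLE signalling case itself, so the named constant is presumably defined along these lines (hedged; the actual definition lives elsewhere in the tree):

    /* Assumed definitions backing the rename: three concrete switchable
     * filters plus one extra context for the SWITCHABLE meta-choice. */
    #define SWITCHABLE_FILTERS 3
    #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)

Using the named constant keeps the array sizes and loop bounds in sync by construction instead of repeating the "+ 1" at every site.
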
@@ -2268,12 +2283,8 @@
   // set up scaling factors
   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
 
-  scale[frame_type].x_offset_q4 =
-      ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].sfc->x_scale_fp,
-       REF_SCALE_SHIFT) & 0xf;
-  scale[frame_type].y_offset_q4 =
-      ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].sfc->y_scale_fp,
-       REF_SCALE_SHIFT) & 0xf;
+  scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+                                            mi_row * MI_SIZE, mi_col * MI_SIZE);
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
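
The open-coded offset math removed above moves behind the scale-factor struct's set_scaled_offsets callback, keeping the q4 offset computation next to the rest of the scaling logic. A hedged reconstruction of what such a callback computes, mirroring the removed lines (function name hypothetical):

    /* The sub-pixel (q4) offset is the scaled pixel position, rounded,
     * reduced modulo 16. row/col arrive in pixels (mi_row/col * MI_SIZE),
     * matching the new call site above. */
    static void set_offsets_with_scaling(struct scale_factors *scale,
                                         int row, int col) {
      scale->x_offset_q4 = ROUND_POWER_OF_TWO(col * scale->sfc->x_scale_fp,
                                              REF_SCALE_SHIFT) & 0xf;
      scale->y_offset_q4 = ROUND_POWER_OF_TWO(row * scale->sfc->y_scale_fp,
                                              REF_SCALE_SHIFT) & 0xf;
    }
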
@@ -3116,8 +3127,8 @@
   int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode = { 0 };
   int j;
   int mode_index, best_mode_index = 0;
@@ -3155,7 +3166,7 @@
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
-  for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
@@ -3542,7 +3553,7 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
@@ -3625,7 +3636,7 @@
         cm->mcomp_filter_type != BILINEAR) {
       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
         int64_t adj_rd;
         // In cases of poor prediction, filter_cache[] can contain really big
         // values, which actually are bigger than this_rd itself. This can
@@ -3747,7 +3758,7 @@
   }
 
   if (!x->skip) {
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       if (best_filter_rd[i] == INT64_MAX)
         best_filter_diff[i] = 0;
       else
@@ -3812,8 +3823,8 @@
   int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode = { 0 };
   int mode_index, best_mode_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -3835,7 +3846,7 @@
   int best_skip2 = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
-  vp9_zero(x->zcoeff_blk);
+  vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
 
   for (i = 0; i < 4; i++) {
     int j;
@@ -3850,7 +3861,7 @@
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
-  for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
@@ -4136,8 +4147,10 @@
               tmp_best_sse = total_sse;
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mbmi;
-              for (i = 0; i < 4; i++)
+              for (i = 0; i < 4; i++) {
                 tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+                x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+              }
               pred_exists = 1;
               if (switchable_filter_index == 0 &&
                   cpi->sf.use_rd_breakout &&
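
Two related zcoeff_blk hunks in this function: the reset near the top now clears only the four TX_4X4 entries the sub-8x8 search actually uses (vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4) rather than vp9_zero over the whole array), and the best-mode caching loop above records a per-sub-block flag derived from the end-of-block counts:

    /* When caching the best sub-8x8 modes, remember which 4x4 blocks
     * quantized to all zeros so later encode passes can skip them. */
    for (i = 0; i < 4; i++) {
      tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
      x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
    }
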
@@ -4292,7 +4305,7 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
@@ -4370,7 +4383,7 @@
         cm->mcomp_filter_type != BILINEAR) {
       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
         int64_t adj_rd;
         // In cases of poor prediction, filter_cache[] can contain really big
         // values, which actually are bigger than this_rd itself. This can
@@ -4486,7 +4499,7 @@
   }
 
   if (!x->skip) {
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       if (best_filter_rd[i] == INT64_MAX)
         best_filter_diff[i] = 0;
       else
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 11fa2e0..0badb08 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,6 +74,7 @@
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0f12d88..4d39670 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -995,8 +995,9 @@
   if (data) {
     int res;
     vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
-                                scalemode.v_scaling_mode);
+    res = vp9_set_internal_size(ctx->cpi,
+                                (VPX_SCALING)scalemode.h_scaling_mode,
+                                (VPX_SCALING)scalemode.v_scaling_mode);
 
     if (!res) {
       return VPX_CODEC_OK;
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 57d3cae..056fa7a 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -100,14 +100,17 @@
 
 /*!\brief reference frame data struct
  *
- * define the data struct to access vp8 reference frames
+ * Define the data struct to access vp8 reference frames.
  */
-
 typedef struct vpx_ref_frame {
   vpx_ref_frame_type_t  frame_type;   /**< which reference frame */
   vpx_image_t           img;          /**< reference frame data in image format */
 } vpx_ref_frame_t;
 
+/*!\brief VP9 specific reference frame data struct
+ *
+ * Define the data struct to access vp9 reference frames.
+ */
 typedef struct vp9_ref_frame {
   int idx; /**< frame index to get (input) */
   vpx_image_t  img; /**< img structure to populate (output) */
@@ -117,7 +120,6 @@
  *
 * defines the data type that each VP8 decoder control function requires
  */
-
 VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
@@ -127,7 +129,6 @@
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
 
-
 /*! @} - end defgroup vp8 */
 
 #ifdef __cplusplus
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 92fdb00..9f68c38 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -7,7 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#ifndef VP8CX_H
+#define VP8CX_H
 
 /*!\defgroup vp8_encoder WebM VP8 Encoder
  * \ingroup vp8
@@ -20,8 +21,6 @@
  * \brief Provides definitions for using the VP8 encoder algorithm within the
  *        vpx Codec Interface.
  */
-#ifndef VP8CX_H
-#define VP8CX_H
 
 #ifdef __cplusplus
 extern "C" {
@@ -223,16 +222,17 @@
  */
 
 typedef struct vpx_roi_map {
-  unsigned char *roi_map;      /**< specify an id between 0 and 3 for each 16x16 region within a frame */
-  unsigned int   rows;         /**< number of rows */
-  unsigned int   cols;         /**< number of cols */
+  /*! An id between 0 and 3 for each 16x16 region within a frame. */
+  unsigned char *roi_map;
+  unsigned int rows;       /**< Number of rows. */
+  unsigned int cols;       /**< Number of columns. */
   // TODO(paulwilkins): broken for VP9 which has 8 segments
   // q and loop filter deltas for each segment
   // (see MAX_MB_SEGMENTS)
-  int     delta_q[4];
-  int     delta_lf[4];
-  // Static breakout threshold for each segment
-  unsigned int   static_threshold[4];
+  int delta_q[4];          /**< Quantizer deltas. */
+  int delta_lf[4];         /**< Loop filter deltas. */
+  /*! Static breakout threshold for each segment. */
+  unsigned int static_threshold[4];
 } vpx_roi_map_t;
 
 /*!\brief  vpx active region map
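
With the members now documented, a hedged usage sketch for vpx_roi_map_t: build a 16x16-granularity map and hand it to the encoder through the existing VP8E_SET_ROI_MAP control. frame_width, frame_height, and the encoder handle are hypothetical application values:

    vpx_roi_map_t roi;
    memset(&roi, 0, sizeof(roi));
    roi.rows = (frame_height + 15) / 16;  /* one entry per 16x16 region */
    roi.cols = (frame_width + 15) / 16;
    roi.roi_map = calloc(roi.rows * roi.cols, 1);  /* all -> segment 0 */
    roi.delta_q[1] = -8;                  /* segment 1: finer quantizer */
    roi.static_threshold[1] = 1000;
    vpx_codec_control(&encoder, VP8E_SET_ROI_MAP, &roi);
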
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 50a223f..d3093c4 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -45,7 +45,8 @@
 #include "vp8.h"
 
 
-/*!\brief VP8 decoder control functions
+/*!\enum vp8_dec_control_id
+ * \brief VP8 decoder control functions
  *
  * This set of macros define the control functions available for the VP8
  * decoder interface.
@@ -78,12 +79,17 @@
   VP8_DECODER_CTRL_ID_MAX
 };
 
+/*!\brief Structure to hold decryption state
+ *
+ * Defines a structure to hold the decryption state and access function.
+ */
 typedef struct vp8_decrypt_init {
     /** Decrypt n bytes of data from input -> output, using the decrypt_state
      *  passed in VP8D_SET_DECRYPTOR.
      */
     void (*decrypt_cb)(void *decrypt_state, const unsigned char *input,
                        unsigned char *output, int count);
+    /*! Decryption state. */
     void *decrypt_state;
 } vp8_decrypt_init;
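
A hedged usage sketch for the now-documented decryption state: register a callback with the decoder through the existing VP8D_SET_DECRYPTOR control. my_decrypt, my_key, and decoder are hypothetical application names, and the XOR body is a toy stand-in for a real cipher:

    static unsigned char my_key[16];  /* filled in by the application */

    static void my_decrypt(void *state, const unsigned char *input,
                           unsigned char *output, int count) {
      const unsigned char *key = (const unsigned char *)state;
      int i;
      for (i = 0; i < count; ++i)
        output[i] = input[i] ^ key[i & 15];  /* toy XOR, not a real cipher */
    }

    /* In the application's decoder setup code: */
    vp8_decrypt_init di = { my_decrypt, my_key };
    vpx_codec_control(&decoder, VP8D_SET_DECRYPTOR, &di);
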
 
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 2e6f1e7..3ea36d6 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -36,12 +36,13 @@
 * Once initialized, the instance is managed using other functions from
  * the vpx_codec_* family.
  */
+#ifndef VPX_CODEC_H
+#define VPX_CODEC_H
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#ifndef VPX_CODEC_H
-#define VPX_CODEC_H
 #include "vpx_integer.h"
 #include "vpx_image.h"
 
@@ -550,9 +551,8 @@
 
   /*!@} - end defgroup cap_xma*/
   /*!@} - end defgroup codec*/
-
-
-#endif
 #ifdef __cplusplus
 }
 #endif
+#endif
+
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index e7701e5..2dcd024 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -7,7 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#ifndef VPX_DECODER_H
+#define VPX_DECODER_H
 
 /*!\defgroup decoder Decoder Algorithm Interface
  * \ingroup codec
@@ -28,8 +29,6 @@
 extern "C" {
 #endif
 
-#ifndef VPX_DECODER_H
-#define VPX_DECODER_H
 #include "vpx_codec.h"
 
   /*!\brief Current ABI version number
@@ -328,9 +327,8 @@
   /*!@} - end defgroup cap_put_slice*/
 
   /*!@} - end defgroup decoder*/
-
-#endif
-
 #ifdef __cplusplus
 }
 #endif
+#endif
+
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 56fd2d9..56752cf 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -7,7 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#ifndef VPX_ENCODER_H
+#define VPX_ENCODER_H
 
 /*!\defgroup encoder Encoder Algorithm Interface
  * \ingroup codec
@@ -28,8 +29,6 @@
 extern "C" {
 #endif
 
-#ifndef VPX_ENCODER_H
-#define VPX_ENCODER_H
 #include "vpx_codec.h"
 
   /*! Temporal Scalability: Maximum length of the sequence defining frame
@@ -930,8 +929,8 @@
 
 
   /*!@} - end defgroup encoder*/
-
-#endif
 #ifdef __cplusplus
 }
 #endif
+#endif
+
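
The guard hunks in vp8cx.h, vpx_codec.h, vpx_decoder.h, and vpx_encoder.h all apply the same fix: the #ifndef/#define pair is hoisted to the very top of the file and the closing #endif moved after the extern "C" terminator, so the guard encloses the whole header, C++ linkage block included. The resulting layout, schematically:

    #ifndef VPX_CODEC_H
    #define VPX_CODEC_H

    #ifdef __cplusplus
    extern "C" {
    #endif

    /* ... declarations ... */

    #ifdef __cplusplus
    }
    #endif
    #endif  /* VPX_CODEC_H */
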
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index b009c35..2990583 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -88,12 +88,14 @@
 #endif
 #endif /* end others */
 
-#define HAS_MMX   0x01
-#define HAS_SSE   0x02
-#define HAS_SSE2  0x04
-#define HAS_SSE3  0x08
-#define HAS_SSSE3 0x10
-#define HAS_SSE4_1 0x20
+#define HAS_MMX     0x01
+#define HAS_SSE     0x02
+#define HAS_SSE2    0x04
+#define HAS_SSE3    0x08
+#define HAS_SSSE3   0x10
+#define HAS_SSE4_1  0x20
+#define HAS_AVX     0x40
+#define HAS_AVX2    0x80
 #ifndef BIT
 #define BIT(n) (1<<n)
 #endif
@@ -132,12 +134,16 @@
 
   if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
 
-  if (reg_ecx & BIT(0))  flags |= HAS_SSE3;
+  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
 
-  if (reg_ecx & BIT(9))  flags |= HAS_SSSE3;
+  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
+  if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+
+  if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+
   return flags & mask;
 }
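
Finally, a hedged dispatch sketch using the new capability bits: x86_simd_caps() (the function these cpuid checks feed, per this header) returns the flag mask, which callers can test before selecting a kernel. The kernel names are hypothetical stand-ins for what the RTCD layer normally wires up; note the hunk tests CPUID feature bits only, and full AVX availability also depends on OS XSAVE support, which is outside this sketch.

    #include "vpx_ports/x86.h"

    void pick_loopfilter(void) {
      const int caps = x86_simd_caps();  /* returns HAS_* flags */
      if (caps & HAS_AVX2)
        loopfilter_avx2();   /* hypothetical AVX2 kernel */
      else if (caps & HAS_SSE2)
        loopfilter_sse2();   /* hypothetical SSE2 kernel */
      else
        loopfilter_c();      /* hypothetical C fallback */
    }
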