Merge "vp9: add multi-threaded tile decoder"

diff --git a/build/make/Makefile b/build/make/Makefile
index 7a25239..030c1b5 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile

@@ -114,6 +114,10 @@
 $(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
 $(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
 $(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")

diff --git a/build/make/configure.sh b/build/make/configure.sh
index c6c8660..83f480a 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh

@@ -1108,6 +1108,18 @@
             soft_enable sse4_1
         fi
 
+        if enabled gcc && ! disabled avx && ! check_cflags -mavx; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx "
+        else
+            soft_enable avx
+        fi
+
+        if enabled gcc && ! disabled avx2 && ! check_cflags -mavx2; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx2 "
+        else
+            soft_enable avx2
+        fi
+
         case "${AS}" in
             auto|"")
                 which nasm >/dev/null 2>&1 && AS=nasm

diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index c531e95..2967b5a 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh

@@ -327,11 +327,11 @@
 require c
 case $arch in
   x86)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
     x86
     ;;
   x86_64)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
     REQUIRES=${REQUIRES:-mmx sse sse2}
     require $(filter $REQUIRES)
     x86

diff --git a/configure b/configure
index 297cec4..621161c 100755
--- a/configure
+++ b/configure

@@ -234,6 +234,8 @@
     sse3
     ssse3
     sse4_1
+    avx
+    avx2
 
     altivec
 "
@@ -422,7 +424,7 @@
     fi
 
     # The write_common_config (config.mk) logic is deferred until after the
-    # recursive calls to configure complete, becuase we want our universal
+    # recursive calls to configure complete, because we want our universal
     # targets to be executed last.
     write_common_config_targets
     enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
@@ -608,7 +610,12 @@
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
         case ${CC} in
-          *clang*) ;;
+          *clang*)
+              # libvpx and/or clang have issues with aliasing:
+              # https://code.google.com/p/webm/issues/detail?id=603
+              # work around them until they are fixed
+              check_add_cflags -fno-strict-aliasing
+          ;;
           *) check_add_cflags -Wunused-but-set-variable ;;
         esac
         enabled extra_warnings || check_add_cflags -Wno-unused-function

diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index a4dbca4..80aca98 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc

@@ -45,6 +45,10 @@
     append_gtest_filter(":-SSSE3/*");
   if (!(simd_caps & HAS_SSE4_1))
     append_gtest_filter(":-SSE4_1/*");
+  if (!(simd_caps & HAS_AVX))
+    append_gtest_filter(":-AVX/*");
+  if (!(simd_caps & HAS_AVX2))
+    append_gtest_filter(":-AVX2/*");
 #endif
 
 #if !CONFIG_SHARED

diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 759d842..35a22c7 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c

@@ -512,15 +512,15 @@
                 else
                 {
                     mbmi->mode =  NEARMV;
-                    vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
                     mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+                    vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
                 }
             }
             else
             {
                 mbmi->mode =  NEARESTMV;
-                vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
                 mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+                vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
             }
         }
         else

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 85ac6d2..218e12e 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -16,12 +16,6 @@
 
 #include "vp9/common/vp9_seg_common.h"
 
-struct loop_filter_info {
-  const uint8_t *mblim;
-  const uint8_t *lim;
-  const uint8_t *hev_thr;
-};
-
 // This structure holds bit masks for all 8x8 blocks in a 64x64 region.
 // Each 1 bit represents a position in which we want to apply the loop filter.
 // Left_ entries refer to whether we apply a filter on the border to the
@@ -259,8 +253,8 @@
     if (block_inside_limit < 1)
       block_inside_limit = 1;
 
-    vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
-    vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
+    vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+    vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
                SIMD_WIDTH);
   }
 }
@@ -268,7 +262,7 @@
 void vp9_loop_filter_init(VP9_COMMON *cm) {
   loop_filter_info_n *lfi = &cm->lf_info;
   struct loopfilter *lf = &cm->lf;
-  int i;
+  int lvl;
 
   // init limits for given sharpness
   update_sharpness(lfi, lf->sharpness_level);
@@ -278,8 +272,8 @@
   lf_init_lut(lfi);
 
   // init hev threshold const vectors
-  for (i = 0; i < 4; i++)
-    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+    vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
 }
 
 void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -330,16 +324,14 @@
 
 static int build_lfi(const loop_filter_info_n *lfi_n,
                      const MB_MODE_INFO *mbmi,
-                     struct loop_filter_info *lfi) {
+                     const loop_filter_thresh **lfi) {
   const int seg = mbmi->segment_id;
   const int ref = mbmi->ref_frame[0];
   const int mode = lfi_n->mode_lf_lut[mbmi->mode];
   const int filter_level = lfi_n->lvl[seg][ref][mode];
 
   if (filter_level > 0) {
-    lfi->mblim = lfi_n->mblim[filter_level];
-    lfi->lim = lfi_n->lim[filter_level];
-    lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
+    *lfi = &lfi_n->lfthr[filter_level];
     return 1;
   } else {
     return 0;
@@ -351,11 +343,13 @@
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                    const struct loop_filter_info *lfi) {
+                                    const loop_filter_thresh **p_lfi) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = *p_lfi;
+
     if (mask & 1) {
       if (mask_16x16 & 1) {
         vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -379,7 +373,7 @@
       vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, 1);
     s += 8;
-    lfi++;
+    p_lfi++;
     mask_16x16 >>= 1;
     mask_8x8 >>= 1;
     mask_4x4 >>= 1;
@@ -393,12 +387,14 @@
                                      unsigned int mask_4x4,
                                      unsigned int mask_4x4_int,
                                      int only_4x4_1,
-                                     const struct loop_filter_info *lfi) {
+                                     const loop_filter_thresh **p_lfi) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
+    const loop_filter_thresh *lfi = *p_lfi;
+
     count = 1;
     if (mask & 1) {
       if (!only_4x4_1) {
@@ -432,7 +428,7 @@
                                         lfi->lim, lfi->hev_thr, 1);
     }
     s += 8 * count;
-    lfi += count;
+    p_lfi += count;
     mask_16x16 >>= count;
     mask_8x8 >>= count;
     mask_4x4 >>= count;
@@ -805,7 +801,7 @@
   unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
   unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
   unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
   int r, c;
 
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -834,7 +830,7 @@
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
       // Filter level can vary per MI
-      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
+      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
         continue;
 
       // Build masks based on the transform size of each block
@@ -925,7 +921,7 @@
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
   unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
   int r, c;
   int row_shift = 3 - ss_x;
   int row_mask = 0xff >> (ss_x << 2);
@@ -938,8 +934,8 @@
     // Determine the vertical edges that need filtering
     for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
       const MODE_INFO *mi = mi_8x8[c];
-      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
-        continue;
+
+      build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
     }
     if (!plane->plane_type) {
       mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);

diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index c698090..62389ea 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h

@@ -46,12 +46,16 @@
 // Need to align this structure so when it is declared and
 // passed it can be loaded into vector registers.
 typedef struct {
-  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
-                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
-                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
-                  hev_thr[4][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+// Loop filter lookup tables: lfthr holds one threshold set per filter level,
+// lvl gives the filter level for each (segment, reference frame, mode index)
+// and mode_lf_lut maps a block's mode to the mode index used by lvl.
+typedef struct {
+  loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
   uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
   uint8_t mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;

diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index a869dc0..19032bf 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h

@@ -127,14 +127,17 @@
   return get_tx_probs(bsize, context, tx_probs);
 }
 
-static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
-                             TX_SIZE tx_size, struct tx_counts *tx_counts) {
-  if (bsize >= BLOCK_32X32)
-    tx_counts->p32x32[context][tx_size]++;
-  else if (bsize >= BLOCK_16X16)
-    tx_counts->p16x16[context][tx_size]++;
+// Returns the row of transform-size counters for the given context: p8x8 for
+// blocks below 16x16, p16x16 for blocks below 32x32 and p32x32 otherwise.
+// The callers in this change index the returned row by tx_size and increment.
+static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+                                   struct tx_counts *tx_counts) {
+  if (bsize < BLOCK_16X16)
+    return tx_counts->p8x8[context];
+  else if (bsize < BLOCK_32X32)
+    return tx_counts->p16x16[context];
   else
-    tx_counts->p8x8[context][tx_size]++;
+    return tx_counts->p32x32[context];
 }
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 2d9fbff..5e049c6 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -22,10 +22,11 @@
 
 # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
 [ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
-  sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+  sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
 
 # this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
+[ "$arch" = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 &&
+  ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
 
 #
 # RECON
@@ -671,10 +672,10 @@
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
 specialize vp9_subtract_block $sse2_x86inc
 
-prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b $ssse3_x86_64
 
-prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b_32x32 $ssse3_x86_64
 
 #

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 6bc51e8..475a299 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -72,7 +72,7 @@
   }
 
   if (!cm->frame_parallel_decoding_mode)
-    update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
+    ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
   return tx_size;
 }
 

diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index f5e4592..12b3f5c 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c

@@ -1151,7 +1151,11 @@
   setup_tile_info(cm, rb);
   sz = vp9_rb_read_literal(rb, 16);
 
-  return sz > 0 ? sz : -1;
+  if (sz == 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid header size");
+
+  return sz;
 }
 
 static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
@@ -1268,15 +1272,9 @@
   YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
 
   if (!first_partition_size) {
-    if (!keyframe) {
       // showing a frame directly
       *p_data_end = data + 1;
       return 0;
-    } else {
-      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid key frame");
-      return -1;
-    }
   }
 
   if (!pbi->decoded_key_frame && !keyframe)

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5ff59a8..9408e54 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -2493,7 +2493,7 @@
             (mbmi->skip_coeff ||
              vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
       const uint8_t context = vp9_get_pred_context_tx_size(xd);
-      update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
+      ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 7ad8d1f..fca7525 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -22,12 +22,14 @@
 extern int enc_debug;
 #endif
 
-void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
-                      int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                      int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
-                      int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      int zbin_oq_value, uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
@@ -86,14 +88,15 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
-                            int16_t *zbin_ptr, int16_t *round_ptr,
-                            int16_t *quant_ptr, int16_t *quant_shift_ptr,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                            int16_t *dequant_ptr, int zbin_oq_value,
-                            uint16_t *eob_ptr, const int16_t *scan,
-                            const int16_t *iscan) {
+                            const int16_t *dequant_ptr,
+                            int zbin_oq_value, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
   int i, rc, eob;
   int zbins[2], nzbins[2];
   int x, y, z, sz;
@@ -174,25 +177,22 @@
   return res;
 }
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
+// Quantizes one 4x4 block. plane_block_idx() maps (y_blocks, b_idx) to a
+// plane and a block index within it; that block's 16 coefficients go through
+// vp9_quantize_b() using the caller-provided scan and iscan tables.
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+                                const int16_t *scan, const int16_t *iscan) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int16_t *scan = get_scan_4x4(tx_type);
-  const int16_t *iscan = get_iscan_4x4(tx_type);
+  struct macroblock_plane *const p = &x->plane[pb_idx.plane];
+  struct macroblockd_plane *const pd = &xd->plane[pb_idx.plane];
 
-  vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
-           16, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           scan, iscan);
+  vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+                 16, x->skip_block,
+                 p->zbin, p->round, p->quant, p->quant_shift,
+                 BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
+                 BLOCK_OFFSET(pd->dqcoeff, pb_idx.block), pd->dequant,
+                 p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {

diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 459aa33..c078e1d 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h

@@ -13,8 +13,9 @@
 
 #include "vp9/encoder/vp9_block.h"
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+                                const int16_t *scan, const int16_t *iscan);
+
 struct VP9_COMP;
 
 void vp9_set_quantizer(struct VP9_COMP *cpi, int q);

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d25112b..c134208 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1032,10 +1032,10 @@
 
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
-  TX_TYPE tx_type = DCT_DCT;
+
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  int idx, idy, block;
+  int idx, idy;
   uint8_t best_dst[8 * 8];
 
   assert(ib < 4);
@@ -1071,8 +1071,8 @@
         const int16_t *nb;
         uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
-
-        block = ib + idy * 2 + idx;
+        const int block = ib + idy * 2 + idx;
+        TX_TYPE tx_type;
         xd->mi_8x8[0]->bmi[block].as_mode = mode;
         src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
         coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1086,13 +1086,15 @@
                            dst, dst_stride);
 
         tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
+        get_scan_nb_4x4(tx_type, &scan, &nb);
+
         if (tx_type != DCT_DCT)
           vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
         else
           x->fwd_txm4x4(src_diff, coeff, 8);
-        vp9_regular_quantize_b_4x4(x, block, tx_type, 16);
 
-        get_scan_nb_4x4(tx_type, &scan, &nb);
+        vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
+
         ratey += cost_coeffs(x, 0, block,
                              tempa + idx, templ + idy, TX_4X4, scan, nb);
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1558,7 +1560,8 @@
       coeff = BLOCK_OFFSET(p->coeff, k);
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
-      vp9_regular_quantize_b_4x4(x, k, DCT_DCT, 16);
+      vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
+                                 get_iscan_4x4(DCT_DCT));
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
       thissse += ssz;

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0f12d88..4d39670 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -995,8 +995,9 @@
   if (data) {
     int res;
     vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
-                                scalemode.v_scaling_mode);
+    res = vp9_set_internal_size(ctx->cpi,
+                                (VPX_SCALING)scalemode.h_scaling_mode,
+                                (VPX_SCALING)scalemode.v_scaling_mode);
 
     if (!res) {
       return VPX_CODEC_OK;

diff --git a/vpx/vp8.h b/vpx/vp8.h
index 57d3cae..056fa7a 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h

@@ -100,14 +100,17 @@
 
 /*!\brief reference frame data struct
  *
- * define the data struct to access vp8 reference frames
+ * Define the data struct to access vp8 reference frames.
  */
-
 typedef struct vpx_ref_frame {
   vpx_ref_frame_type_t  frame_type;   /**< which reference frame */
   vpx_image_t           img;          /**< reference frame data in image format */
 } vpx_ref_frame_t;
 
+/*!\brief VP9 specific reference frame data struct
+ *
+ * Define the data struct to access vp9 reference frames.
+ */
 typedef struct vp9_ref_frame {
   int idx; /**< frame index to get (input) */
   vpx_image_t  img; /**< img structure to populate (output) */
@@ -117,7 +120,6 @@
  *
  * defines the data type for each of VP8 decoder control function requires
  */
-
 VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
@@ -127,7 +129,6 @@
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
 
-
 /*! @} - end defgroup vp8 */
 
 #ifdef __cplusplus

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 92fdb00..9f68c38 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h

@@ -7,7 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#ifndef VP8CX_H
+#define VP8CX_H
 
 /*!\defgroup vp8_encoder WebM VP8 Encoder
  * \ingroup vp8
@@ -20,8 +21,6 @@
  * \brief Provides definitions for using the VP8 encoder algorithm within the
  *        vpx Codec Interface.
  */
-#ifndef VP8CX_H
-#define VP8CX_H
 
 #ifdef __cplusplus
 extern "C" {
@@ -223,16 +222,17 @@
  */
 
 typedef struct vpx_roi_map {
-  unsigned char *roi_map;      /**< specify an id between 0 and 3 for each 16x16 region within a frame */
-  unsigned int   rows;         /**< number of rows */
-  unsigned int   cols;         /**< number of cols */
+  /*! An id between 0 and 3 for each 16x16 region within a frame. */
+  unsigned char *roi_map;
+  unsigned int rows;       /**< Number of rows. */
+  unsigned int cols;       /**< Number of columns. */
   // TODO(paulwilkins): broken for VP9 which has 8 segments
   // q and loop filter deltas for each segment
   // (see MAX_MB_SEGMENTS)
-  int     delta_q[4];
-  int     delta_lf[4];
-  // Static breakout threshold for each segment
-  unsigned int   static_threshold[4];
+  int delta_q[4];          /**< Quantizer deltas. */
+  int delta_lf[4];         /**< Loop filter deltas. */
+  /*! Static breakout threshold for each segment. */
+  unsigned int static_threshold[4];
 } vpx_roi_map_t;
 
 /*!\brief  vpx active region map

diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 50a223f..d3093c4 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h

@@ -45,7 +45,8 @@
 #include "vp8.h"
 
 
-/*!\brief VP8 decoder control functions
+/*!\enum vp8_dec_control_id
+ * \brief VP8 decoder control functions
  *
  * This set of macros define the control functions available for the VP8
  * decoder interface.
@@ -78,12 +79,17 @@
   VP8_DECODER_CTRL_ID_MAX
 };
 
+/*!\brief Structure to hold decryption state
+ *
+ * Defines a structure to hold the decryption state and access function.
+ */
 typedef struct vp8_decrypt_init {
     /** Decrypt n bytes of data from input -> output, using the decrypt_state
      *  passed in VP8D_SET_DECRYPTOR.
      */
     void (*decrypt_cb)(void *decrypt_state, const unsigned char *input,
                        unsigned char *output, int count);
+    /*! Decryption state. */
     void *decrypt_state;
 } vp8_decrypt_init;
 

diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 2e6f1e7..3ea36d6 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h

@@ -36,12 +36,13 @@
  * Once initialized, the instance is manged using other functions from
  * the vpx_codec_* family.
  */
+#ifndef VPX_CODEC_H
+#define VPX_CODEC_H
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#ifndef VPX_CODEC_H
-#define VPX_CODEC_H
 #include "vpx_integer.h"
 #include "vpx_image.h"
 
@@ -550,9 +551,8 @@
 
   /*!@} - end defgroup cap_xma*/
   /*!@} - end defgroup codec*/
-
-
-#endif
 #ifdef __cplusplus
 }
 #endif
+#endif
+

diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index e7701e5..2dcd024 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h

@@ -7,7 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#ifndef VPX_DECODER_H
+#define VPX_DECODER_H
 
 /*!\defgroup decoder Decoder Algorithm Interface
  * \ingroup codec
@@ -28,8 +29,6 @@
 extern "C" {
 #endif
 
-#ifndef VPX_DECODER_H
-#define VPX_DECODER_H
 #include "vpx_codec.h"
 
   /*!\brief Current ABI version number
@@ -328,9 +327,8 @@
   /*!@} - end defgroup cap_put_slice*/
 
   /*!@} - end defgroup decoder*/
-
-#endif
-
 #ifdef __cplusplus
 }
 #endif
+#endif
+

diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 56fd2d9..56752cf 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h

@@ -7,7 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#ifndef VPX_ENCODER_H
+#define VPX_ENCODER_H
 
 /*!\defgroup encoder Encoder Algorithm Interface
  * \ingroup codec
@@ -28,8 +29,6 @@
 extern "C" {
 #endif
 
-#ifndef VPX_ENCODER_H
-#define VPX_ENCODER_H
 #include "vpx_codec.h"
 
   /*! Temporal Scalability: Maximum length of the sequence defining frame
@@ -930,8 +929,8 @@
 
 
   /*!@} - end defgroup encoder*/
-
-#endif
 #ifdef __cplusplus
 }
 #endif
+#endif
+

diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index b009c35..2990583 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h

@@ -88,12 +88,14 @@
 #endif
 #endif /* end others */
 
-#define HAS_MMX   0x01
-#define HAS_SSE   0x02
-#define HAS_SSE2  0x04
-#define HAS_SSE3  0x08
-#define HAS_SSSE3 0x10
-#define HAS_SSE4_1 0x20
+#define HAS_MMX     0x01
+#define HAS_SSE     0x02
+#define HAS_SSE2    0x04
+#define HAS_SSE3    0x08
+#define HAS_SSSE3   0x10
+#define HAS_SSE4_1  0x20
+#define HAS_AVX     0x40
+#define HAS_AVX2    0x80
 #ifndef BIT
 #define BIT(n) (1<<n)
 #endif
@@ -132,12 +134,16 @@
 
   if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
 
-  if (reg_ecx & BIT(0))  flags |= HAS_SSE3;
+  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
 
-  if (reg_ecx & BIT(9))  flags |= HAS_SSSE3;
+  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
+  if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+  /* NOTE(review): AVX2 is CPUID leaf 7 EBX bit 5; confirm reg_ebx's leaf. */
+  if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+
   return flags & mask;
 }