Merge "vp9_filter: move table alignment decl's to header"

diff --git a/docs.mk b/docs.mk
index 9426f76..797b466 100644
--- a/docs.mk
+++ b/docs.mk

@@ -30,7 +30,9 @@
 
 
 EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
+EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
 
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
 doxyfile: libs.doxy_template libs.doxy
 	@echo "    [CREATE] $@"
 	@cat $^ > $@

diff --git a/examples.mk b/examples.mk
index e4abcf7..40756e1 100644
--- a/examples.mk
+++ b/examples.mk

@@ -285,3 +285,36 @@
                                $(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe)))
 $(foreach proj,$(call enabled,PROJECTS),\
     $(eval $(call vcproj_template,$(proj))))
+
+# Documentation rules: %.dox wraps one source file in a Doxygen page,
+# samples.dox is the index page listing every example and utility, and
+# examples.doxy adds all of those pages to the Doxygen INPUT list.
+%.dox: %.c
+	@echo "    [DOXY] $@"
+	@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
+	@echo "   \includelineno $(<F)" >> $@
+	@echo "*/" >> $@
+
+samples.dox: examples.mk
+	@echo "    [DOXY] $@"
+	@echo "/*!\page samples Sample Code" > $@
+	@echo "    This SDK includes a number of sample applications."\
+	      "Each sample documents a feature of the SDK in both prose"\
+	      "and the associated C code."\
+	      "The following samples are included: ">>$@
+	@$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo >> $@
+	@echo "    In addition, the SDK contains a number of utilities."\
+              "Since these utilities are built upon the concepts described"\
+              "in the sample code listed above, they are not documented in"\
+              "pieces like the samples are. Their source is included here"\
+              "for reference. The following utilities are included:" >> $@
+	@$(foreach ex,$(sort $(UTILS:.c=)),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo "*/" >> $@
+
+CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
+DOCS-yes += examples.doxy samples.dox
+examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
+	@echo "INPUT += $^" > $@

diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 3aa15d2..dc9856f 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm

@@ -315,8 +315,8 @@
     vdup.u16            q2, r2
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
 
@@ -327,8 +327,8 @@
     vdup.u16            q2, r2
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
     bx                  lr
@@ -372,10 +372,10 @@
     vadd.s16            q8, q3, q8
     vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q0, #0
-    vqshrun.s16         d1, q1, #0
-    vqshrun.s16         d2, q8, #0
-    vqshrun.s16         d3, q9, #0
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
 
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
@@ -394,10 +394,10 @@
     vadd.s16            q8, q3, q8
     vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q0, #0
-    vqshrun.s16         d1, q1, #0
-    vqshrun.s16         d2, q8, #0
-    vqshrun.s16         d3, q9, #0
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
 
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
@@ -445,10 +445,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d20[2]                  ; proload next 2 rows data
     vdup.16             q8, d20[3]
     vst1.64             {d2,d3}, [r0], r1
@@ -459,10 +459,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d21[0]                  ; proload next 2 rows data
     vdup.16             q8, d21[1]
     vst1.64             {d2,d3}, [r0], r1
@@ -472,10 +472,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d21[2]                  ; proload next 2 rows data
     vdup.16             q8, d21[3]
     vst1.64             {d2,d3}, [r0], r1
@@ -486,10 +486,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vld1.8              {d18}, [r3]!                  ; preload 8 left into r12
     vmovl.u8            q10, d18
     vst1.64             {d2,d3}, [r0], r1
@@ -542,19 +542,19 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q1, d6[2]
     vdup.16             q2, d6[3]
     vst1.64             {d24-d27}, [r0], r1
@@ -564,19 +564,19 @@
     vadd.s16            q13, q1, q9
     vadd.s16            q14, q1, q10
     vadd.s16            q15, q1, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q0, d7[0]
     vdup.16             q2, d7[1]
     vst1.64             {d24-d27}, [r0], r1
@@ -586,19 +586,19 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q0, d7[2]
     vdup.16             q2, d7[3]
     vst1.64             {d24-d27}, [r0], r1
@@ -608,20 +608,20 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
     vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vmovl.u8            q3, d0
     vst1.64             {d24-d27}, [r0], r1
 

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 6317103..a18ae9b 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -386,7 +386,7 @@
 specialize vp9_variance4x4 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
@@ -416,7 +416,7 @@
 specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc
 
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc

diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index a2cf910..1b4904c 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c

@@ -142,20 +142,29 @@
 #if HAVE_AVX2
 filter8_1dfunction vp9_filter_block1d16_v8_avx2;
 filter8_1dfunction vp9_filter_block1d16_h8_avx2;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif
 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
 #define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
@@ -183,12 +192,26 @@
 FUN_CONV_2D(, avx2);
 #endif
 #if HAVE_SSSE3
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else
 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;

diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..dbea141
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

@@ -0,0 +1,484 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// byte-shuffle masks (adjacent source-pixel pairs) used only by 4_h8
+DECLARE_ALIGNED(16, const unsigned char, filt1_4_h8[16])= {
+    0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
+
+DECLARE_ALIGNED(16, const unsigned char, filt2_4_h8[16])= {
+    4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
+
+// byte-shuffle masks (adjacent source-pixel pairs) for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, const unsigned char, filt1_global[16])= {
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
+
+DECLARE_ALIGNED(16, const unsigned char, filt2_global[16])= {
+    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
+
+DECLARE_ALIGNED(16, const unsigned char, filt3_global[16])= {
+    4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
+
+DECLARE_ALIGNED(16, const unsigned char, filt4_global[16])= {
+    6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
+
+void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+
+  // rounding constant: 64 in each of the eight 16-bit lanes
+  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // pack the eight 16-bit taps to 8 bits (signed saturation); both 64-bit
+  // halves of the register then hold the same eight taps.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // low 64 bits: repeat the first 16-bit pair of taps (taps 0 and 1)
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // low 64 bits: repeat the third 16-bit pair of taps (taps 4 and 5)
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // high 64 bits: repeat the second 16-bit pair of taps (taps 2 and 3)
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // high 64 bits: repeat the fourth 16-bit pair of taps (taps 6 and 7)
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // load the byte-shuffle masks (kept in thirdFilters / forthFilters)
+  thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
+  forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // shuffle the source bytes into adjacent-pixel pairs
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // move the upper 64 bits of each partial sum down to the lower 64 bits
+    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // saturating adds: outer pairs, then min and max of the middle pairs
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit lane to 8 bits with unsigned saturation
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr+=src_pixels_per_line;
+
+    // store only the low 4 bytes (4 output pixels)
+    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // rounding constant: 64 in each of the eight 16-bit lanes
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // pack the eight 16-bit taps to 8 bits (signed saturation); both 64-bit
+  // halves of the register then hold the same eight taps.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across the 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across the 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // shuffle the source bytes into adjacent-pixel pairs (taps 0-1, 2-3)
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // shuffle the source bytes into adjacent-pixel pairs (taps 4-5, 6-7)
+    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // saturating adds: first two pairs, then min and max of the last two
+    minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+    srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit lane to 8 bits with unsigned saturation
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // store only the low 8 bytes (8 output pixels)
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pixels_per_line,
+                                          unsigned char *output_ptr,
+                                          unsigned int output_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+  unsigned int i;
+
+  // rounding constant: 64 in each of the eight 16-bit lanes
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // pack the eight 16-bit taps to 8 bits (signed saturation); both 64-bit
+  // halves of the register then hold the same eight taps.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across the 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across the 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // shuffle the source bytes into adjacent-pixel pairs (taps 0-1, 2-3)
+    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // saturating add of the first two partial sums
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // shuffle the source bytes into adjacent-pixel pairs (taps 6-7, 4-5)
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // saturating add of the smaller of the last two partial sums
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // load the 16 bytes starting 8 pixels further on (src_ptr+5);
+    // the first 8 of them overlap the tail of the earlier load
+    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+    // saturating add of the larger of the last two partial sums
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // second half: shuffle into adjacent-pixel pairs (taps 0-1, 2-3)
+    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // saturating add of the first two partial sums
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // second half: shuffle into adjacent-pixel pairs (taps 6-7, 4-5)
+    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+    // multiply each pixel pair by its pair of taps and add the two products
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // saturating adds: min, then max, of the last two partial sums
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+    // arithmetic shift right by 7 in each 16-bit lane
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // pack each 16-bit lane to 8 bits with unsigned saturation; the low
+    // 64 bits hold the first 8 output pixels and the high 64 bits hold
+    // the next 8
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int out_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+  unsigned int i;
+
+  // rounding constant: 64 in each of the eight 16-bit lanes
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // pack the eight 16-bit taps to 8 bits (signed saturation); both 64-bit
+  // halves of the register then hold the same eight taps.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (taps 0 and 1) across the register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (taps 2 and 3) across the register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (taps 4 and 5) across the register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (taps 6 and 7) across the register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  for (i = 0; i < output_height; i++) {
+    // load 8 bytes from the current row
+    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+    // load 8 bytes from each of the next three rows (stride src_pitch)
+    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
+    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
+    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+
+    // interleave the bytes of rows 0/1 and of rows 2/3
+    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+
+    // load 8 bytes from each of rows 4 to 7 (stride src_pitch)
+    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
+    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
+    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
+    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+
+    // interleave the bytes of rows 4/5 and of rows 6/7
+    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+
+    // multiply each row pair by its pair of taps and add the two products
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // saturating adds: outer pairs, then min and max of the middle pairs
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit lane to 8 bits with unsigned saturation
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // store only the low 8 bytes (8 output pixels)
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pitch,
+                                          unsigned char *output_ptr,
+                                          unsigned int out_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  unsigned int i;  // output row counter
+
+  // rounding constant: 64 in each 16-bit lane, added before the shift by 7
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // saturate-pack the eight 16-bit taps to signed bytes; packing the register
+  // with itself leaves the same 8 tap bytes in both 64-bit halves.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // broadcast tap bytes 0 and 1 to every 16-bit lane
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // broadcast tap bytes 2 and 3 to every 16-bit lane
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // broadcast tap bytes 4 and 5 to every 16-bit lane
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // broadcast tap bytes 6 and 7 (the fourth pair) to every 16-bit lane
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  for (i = 0; i < output_height; i++) {
+    // one 16-byte output row per pass, from source rows 0..7; load row 0
+    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
+    // load rows 1, 6 and 7; consecutive rows are src_pitch bytes apart
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
+    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+    // interleave rows 0/1 and rows 6/7: lo = columns 0-7, hi = columns 8-15
+    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+
+    // taps 0,1 on rows 0/1 and taps 6,7 on rows 6/7; pair sums are 16 bit
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+    // saturating add of the two outer partial sums (rows 0/1 + rows 6/7)
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+    // load rows 2 and 3
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
+
+    // interleave rows 2/3 byte-wise (lo = columns 0-7, hi = columns 8-15)
+    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+    // apply taps 2,3 to rows 2/3 and sum each adjacent pair
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+    // load rows 4 and 5
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
+
+    // interleave rows 4/5 byte-wise (lo = columns 0-7, hi = columns 8-15)
+    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+    // apply taps 4,5 to rows 4/5 and sum each adjacent pair
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+    // saturating add of the smaller of the two middle partial sums first
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+    // then the larger middle partial sum, then the rounding constant
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 in each 16-bit lane
+    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack to unsigned bytes with saturation: the low 8 bytes come from
+    // srcRegFilt5 (columns 0-7) and the high 8 bytes from srcRegFilt1
+    // (columns 8-15)
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // aligned store of the 16 results: output_ptr must be 16-byte aligned
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8023466..1a9ab60 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1047,28 +1047,9 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 
   const int mb_mode_index = ctx->best_mode_index;
-  int max_plane;
-
-  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
-  for (i = 0; i < max_plane; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][1];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
-    p[i].eobs = ctx->eobs_pbuf[i][1];
-  }
-
-  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][2];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
-    p[i].eobs = ctx->eobs_pbuf[i][2];
-  }
-
   x->skip = ctx->skip;
 
   if (frame_is_intra_only(cm)) {
@@ -1128,8 +1109,8 @@
 }
 
 static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
-                      TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE bsize) {
+                         TOKENEXTRA **tp, int mi_row, int mi_col,
+                         int output_enabled, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
@@ -1147,7 +1128,6 @@
     ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, bsize);
     subsize = mi_8x8[0]->mbmi.sb_type;
-
   } else {
     ctx = 0;
     subsize = BLOCK_4X4;
@@ -2272,11 +2252,11 @@
 }
 
 static void rtc_use_partition(VP9_COMP *cpi,
-                             const TileInfo *const tile,
-                             MODE_INFO **mi_8x8,
-                             TOKENEXTRA **tp, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                             int do_recon) {
+                              const TileInfo *const tile,
+                              MODE_INFO **mi_8x8,
+                              TOKENEXTRA **tp, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                              int do_recon) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -2334,6 +2314,7 @@
       }
     }
   }
+
   encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
 
   *rate = chosen_rate;
@@ -2417,6 +2398,22 @@
 
   set_prev_mi(cm);
 
+  if (cpi->sf.use_pick_mode) {
+    // Initialize internal buffer pointers for rtc coding, where non-RD
+    // mode decision is used and hence no buffer pointer swap needed.
+    int i;
+    struct macroblock_plane *const p = x->plane;
+    struct macroblockd_plane *const pd = xd->plane;
+    PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context;
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      p[i].coeff = ctx->coeff_pbuf[i][0];
+      p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+      pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+      p[i].eobs = ctx->eobs_pbuf[i][0];
+    }
+  }
+
   {
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
@@ -2700,6 +2697,7 @@
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
                    (cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
                    !cpi->sf.use_pick_mode;

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 556197c..c2aac3e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -65,7 +65,7 @@
 
   double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
+  for (i = 0; i < QINDEX_RANGE; ++i) {
     if (target_q <= vp9_convert_qindex_to_q(i)) {
       ret_val = i;
       break;
@@ -399,7 +399,7 @@
   // Refine the motion search range according to the frame dimension
   // for first pass test.
   while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
-    sr++;
+    ++sr;
 
   step_param += sr;
   further_steps -= sr;
@@ -427,10 +427,10 @@
   num00 = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
-      num00--;
+      --num00;
     } else {
       tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
@@ -522,7 +522,7 @@
   // Tiling is ignored in the first pass.
   vp9_tile_init(&tile, cm, 0, 0);
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
     int_mv best_ref_mv;
 
     best_ref_mv.as_int = 0;
@@ -538,7 +538,7 @@
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                     + BORDER_MV_PIXELS_B16;
 
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
       int this_error;
       const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
@@ -638,7 +638,7 @@
           }
 
           if (gf_motion_error < motion_error && gf_motion_error < this_error)
-            second_ref_count++;
+            ++second_ref_count;
 
           // Reset to last frame as reference buffer.
           xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
@@ -665,7 +665,7 @@
           // cropped clips with black bars at the sides or top and bottom.
           if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
               this_error < 2 * intrapenalty)
-            neutral_count++;
+            ++neutral_count;
 
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
@@ -682,42 +682,42 @@
           sum_mvc_abs += abs(mv.as_mv.col);
           sum_mvrs += mv.as_mv.row * mv.as_mv.row;
           sum_mvcs += mv.as_mv.col * mv.as_mv.col;
-          intercount++;
+          ++intercount;
 
           best_ref_mv.as_int = mv.as_int;
 
           if (mv.as_int) {
-            mvcount++;
+            ++mvcount;
 
             // Non-zero vector, was it different from the last non zero vector?
             if (mv.as_int != lastmv_as_int)
-              new_mv_count++;
+              ++new_mv_count;
             lastmv_as_int = mv.as_int;
 
             // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
 
             // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
           }
         }
@@ -802,7 +802,7 @@
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     twopass->sr_update_lag = 1;
   } else {
-    twopass->sr_update_lag++;
+    ++twopass->sr_update_lag;
   }
   // Swap frame pointers so last frame refers to the frame we just compressed.
   swap_yv12(lst_yv12, new_yv12);
@@ -830,7 +830,7 @@
     fclose(recon_file);
   }
 
-  cm->current_video_frame++;
+  ++cm->current_video_frame;
 }
 
 // Estimate a cost per mb attributable to overheads such as the coding of modes
@@ -910,7 +910,7 @@
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (q = rc->best_quality; q < rc->worst_quality; q++) {
+  for (q = rc->best_quality; q < rc->worst_quality; ++q) {
     const double err_correction_factor = calc_correction_factor(err_per_mb,
                                              ERR_DIVISOR, 0.5, 0.90, q);
     const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
@@ -1045,7 +1045,7 @@
     FIRSTPASS_STATS tmp_next_frame;
 
     // Look ahead a few frames to see if static condition persists...
-    for (j = 0; j < still_interval; j++) {
+    for (j = 0; j < still_interval; ++j) {
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
@@ -1164,7 +1164,7 @@
   int flash_detected = 0;
 
   // Search forward from the proposed arf/next gf position.
-  for (i = 0; i < f_frames; i++) {
+  for (i = 0; i < f_frames; ++i) {
     if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
@@ -1201,7 +1201,7 @@
   abs_mv_in_out_accumulator = 0.0;
 
   // Search backward towards last gf position.
-  for (i = -1; i >= -b_frames; i--) {
+  for (i = -1; i >= -b_frames; --i) {
     if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
@@ -1443,7 +1443,7 @@
 
   i = 0;
   while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) {
-    i++;
+    ++i;
 
     // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1515,7 +1515,7 @@
   // Don't allow a gf too near the next kf.
   if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
-      i++;
+      ++i;
 
       if (EOF == input_stats(twopass, this_frame))
         break;
@@ -1752,7 +1752,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(twopass, start_pos);
 
-    for (i = 0; i < rc->baseline_gf_interval; i++) {
+    for (i = 0; i < rc->baseline_gf_interval; ++i) {
       input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -1837,7 +1837,7 @@
     start_pos = cpi->twopass.stats_in;
 
     // Examine how well the key frame predicts subsequent frames.
-    for (i = 0; i < 16; i++) {
+    for (i = 0; i < 16; ++i) {
       double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
                              DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
@@ -1956,7 +1956,7 @@
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++)
+      for (j = 0; j < 8; ++j)
         decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
@@ -1966,7 +1966,7 @@
         break;
 
       // Step on to the next frame.
-      rc->frames_to_key++;
+      ++rc->frames_to_key;
 
       // If we don't have a real key frame within the next two
       // key_frame_frequency intervals then break out of the loop.
@@ -1975,7 +1975,7 @@
     } else {
       ++rc->frames_to_key;
     }
-    i++;
+    ++i;
   }
 
   // If there is a max kf interval set by the user we must obey it.
@@ -1997,7 +1997,7 @@
     kf_group_err = 0;
 
     // Rescan to get the correct error data for the forced kf group.
-    for (i = 0; i < rc->frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; ++i) {
       // Accumulate kf group errors.
       kf_group_err += calculate_modified_err(cpi, &tmp_frame);
 
@@ -2046,7 +2046,7 @@
   boost_score = 0.0;
 
   // Scan through the kf group collating various stats.
-  for (i = 0; i < rc->frames_to_key; i++) {
+  for (i = 0; i < rc->frames_to_key; ++i) {
     double r;
 
     if (EOF == input_stats(twopass, &next_frame))
@@ -2089,7 +2089,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(twopass, start_position);
 
-    for (i = 0; i < rc->frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; ++i) {
       input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 4561e76..23274fc 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -217,6 +217,7 @@
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a value that should equate to thegiven rate ratio.
+
 static int compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index,
                                   double rate_target_ratio) {
   int i;
@@ -1110,7 +1111,7 @@
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
   VP9_COMMON *const cm = &cpi->common;
-  int64_t vbr_max_bits;
+  int vbr_max_bits;
 
   if (framerate < 0.1)
     framerate = 30;
@@ -1134,10 +1135,10 @@
   // be acheived because of a user specificed max q (e.g. when the user
   // specifies lossless encode.
   //
-  vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
-                  (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+  vbr_max_bits = (int)(((int64_t)cpi->rc.av_per_frame_bandwidth *
+      cpi->oxcf.two_pass_vbrmax_section) / 100);
   cpi->rc.max_frame_bandwidth =
-    MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+      MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
 
   // Set Maximum gf/arf interval
   cpi->rc.max_gf_interval = 16;
@@ -1158,7 +1159,7 @@
     cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
 }
 
-static int64_t rescale(int val, int64_t num, int denom) {
+static int64_t rescale(int64_t val, int64_t num, int denom) {
   int64_t llnum = num;
   int64_t llden = denom;
   int64_t llval = val;
@@ -1211,9 +1212,12 @@
     lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000;
     bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth;
     // Update buffer-related quantities.
-    lc->starting_buffer_level = oxcf->starting_buffer_level * bitrate_alloc;
-    lc->optimal_buffer_level = oxcf->optimal_buffer_level * bitrate_alloc;
-    lc->maximum_buffer_size = oxcf->maximum_buffer_size * bitrate_alloc;
+    lc->starting_buffer_level =
+        (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
+    lc->optimal_buffer_level =
+        (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
+    lc->maximum_buffer_size =
+        (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
     lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
     // Update framerate-related quantities.
@@ -1245,8 +1249,8 @@
     int prev_layer_target_bandwidth =
         oxcf->ts_target_bitrate[temporal_layer - 1] * 1000;
     lc->avg_frame_size =
-        (int)(lc->target_bandwidth - prev_layer_target_bandwidth) /
-        (lc->framerate - prev_layer_framerate);
+        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
+              (lc->framerate - prev_layer_framerate));
   }
 }
 
@@ -1274,7 +1278,7 @@
   int temporal_layer = cpi->svc.temporal_layer_id;
   LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
   lc->rc = cpi->rc;
-  lc->target_bandwidth = cpi->oxcf.target_bandwidth;
+  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
   lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
   lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
   lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
@@ -1491,7 +1495,7 @@
 
   if (cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+    update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth);
   }
 
   cpi->speed = abs(cpi->oxcf.cpu_used);
@@ -1568,6 +1572,7 @@
   int num_pix = num_4x4_blk << 4;
   int i, k;
   ctx->num_4x4_blk = num_4x4_blk;
+
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -1611,7 +1616,6 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x  = &cpi->mb;
 
-
   for (i = 0; i < BLOCK_SIZES; ++i) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
     const int num_4x4_h = num_4x4_blocks_high_lookup[i];
@@ -2029,10 +2033,12 @@
                   / time_encoded;
 
       if (cpi->b_calculate_psnr) {
-        const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0,
-                                               cpi->total_sq_error);
-        const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0,
-                                                cpi->totalp_sq_error);
+        const double total_psnr =
+            vp9_mse2psnr((double)cpi->total_samples, 255.0,
+                         (double)cpi->total_sq_error);
+        const double totalp_psnr =
+            vp9_mse2psnr((double)cpi->totalp_samples, 255.0,
+                         (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
                                                 cpi->summed_weights, 8.0);
         const double totalp_ssim = 100 * pow(cpi->summedp_quality /
@@ -2208,20 +2214,20 @@
     const int w = widths[i];
     const int h = heights[i];
     const uint32_t samples = w * h;
-    const double sse = calc_plane_error(a_planes[i], a_strides[i],
-                                        b_planes[i], b_strides[i],
-                                        w, h);
+    const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i],
+                                          b_planes[i], b_strides[i],
+                                          w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse);
+    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, (double) sse);
 
-    total_sse += sse;
+    total_sse += (uint64_t)sse;
     total_samples += samples;
   }
 
   psnr->sse[0] = total_sse;
   psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse);
+  psnr->psnr[0] = vp9_mse2psnr((double)total_samples, 255.0, (double)total_sse);
 }
 
 static void generate_psnr_packet(VP9_COMP *cpi) {
@@ -2892,7 +2898,7 @@
       if (!cpi->sf.use_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
-      cpi->rc.projected_frame_size = (*size) << 3;
+      cpi->rc.projected_frame_size = (int)(*size) << 3;
       vp9_restore_coding_context(cpi);
 
       if (frame_over_shoot_limit == 0)
@@ -3762,7 +3768,7 @@
 #if CONFIG_INTERNAL_STATS
 
   if (cpi->pass != 1) {
-    cpi->bytes += *size;
+    cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
       cpi->count++;

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 822185a..945fa81 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -127,11 +127,52 @@
   // calculate the bit cost on motion vector
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-
-
   return bestsme;
 }
 
+static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv) {  // updated in place
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};  // saved pre[0] planes
+  int ref = mbmi->ref_frame[0];  // first reference frame of this block
+  int_mv ref_mv = mbmi->ref_mvs[ref][0];  // passed to the search as ref MV
+  int dis;  // written by find_fractional_mv_step; not read here
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  tmp_mv->as_mv.col >>= 3;  // presumably 1/8-pel -> full-pel units;
+  tmp_mv->as_mv.row >>= 3;  // TODO confirm against the caller
+
+  cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+                               cpi->common.allow_high_precision_mv,
+                               x->errorperbit,
+                               &cpi->fn_ptr[bsize],
+                               cpi->sf.subpel_force_stop,
+                               cpi->sf.subpel_iters_per_step,
+                               x->nmvjointcost, x->mvcost,
+                               &dis, &x->pred_sse[ref]);
+
+  if (scaled_ref_frame) {  // put back the planes saved before the search
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+}
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -161,6 +202,7 @@
 
   // initialize mode decisions
   *returnrate = INT_MAX;
+  *returndistortion = INT64_MAX;
   vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
@@ -200,9 +242,6 @@
       int64_t dist;
 
       if (this_mode == NEWMV) {
-        if (this_rd < 300)
-          continue;
-
         x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
             full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
                                      &frame_mv[NEWMV][ref_frame], &rate_mv);
@@ -226,6 +265,13 @@
   }
 
   // TODO(jingning) sub-pixel motion search, if NEWMV is chosen
+  if (mbmi->mode == NEWMV) {
+    ref_frame = mbmi->ref_frame[0];
+    sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                            &frame_mv[NEWMV][ref_frame]);
+    mbmi->mv[0].as_int = frame_mv[NEWMV][ref_frame].as_int;
+    xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+  }
 
   // TODO(jingning) intra prediction search, if the best SAD is above a certain
   // threshold.

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 862573f..372c362 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -26,7 +26,7 @@
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = count, eob = -1;
+  int i, non_zero_count = (int)count, eob = -1;
   const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
                          zbin_ptr[1] + zbin_oq_value };
   const int nzbins[2] = { zbins[0] * -1,
@@ -37,7 +37,7 @@
 
   if (!skip_block) {
     // Pre-scan pass
-    for (i = count - 1; i >= 0; i--) {
+    for (i = (int)count - 1; i >= 0; i--) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
 

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 4e2e268..2be00ff 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -257,7 +257,8 @@
 
     add_token(&t, coef_probs[band[c]][pt],
               vp9_dct_value_tokens_ptr[v].extra,
-              vp9_dct_value_tokens_ptr[v].token, skip_eob,
+              (uint8_t)vp9_dct_value_tokens_ptr[v].token,
+              (uint8_t)skip_eob,
               counts[band[c]][pt]);
     eob_branch[band[c]][pt] += !skip_eob;
 

diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
new file mode 100644
index 0000000..a8f98e9
--- /dev/null
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c

@@ -0,0 +1,640 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vpx_ports/mem.h"
+#include "vp9/encoder/vp9_variance.h"
+
+DECLARE_ALIGNED(32, const unsigned char, vp9_bilinear_filters_avx2[512])= {
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15};
+
+// Accumulates the difference statistics between a bilinearly interpolated,
+// 32-pixel-wide source block and a destination block over `height` rows.
+//   src, src_stride  source rows (unaligned loads). A non-zero x_offset
+//                    also reads 1 byte past each 32-byte row; a non-zero
+//                    y_offset also reads the row after the last one.
+//   x_offset,        select one of the 16 32-byte filters in
+//   y_offset         vp9_bilinear_filters_avx2 (0..15). 0 skips filtering
+//                    and 8 is done as a rounded average of two pixels.
+//   dst, dst_stride  destination rows; no particular alignment is needed.
+//   height           row count; the 16-bit sum lanes are only guaranteed
+//                    not to overflow for heights up to 64.
+//   sse              receives the sum of squared differences.
+// Returns the signed sum of (filtered src - dst), carried as unsigned int.
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height, unsigned int *sse) {
+  __m256i src_reg,  dst_reg,  exp_src_lo,  exp_src_hi,  exp_dst_lo,  exp_dst_hi;
+  __m256i sse_reg,  sum_reg,  sse_reg_hi,  res_cmp,  sum_reg_lo,  sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  if (x_offset == 0) {
+    // x_offset = 0 and y_offset = 0
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        // load source and destination (neither has to be aligned)
+        src_reg = _mm256_loadu_si256((__m256i const *) (src));
+        dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+        // expand each byte to 2 bytes
+        exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+        exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+        exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+        exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+        // source - dest
+        exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+        exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+        // calculate sum
+        sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+        exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+        sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+        exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+        // calculate sse
+        sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+        sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = 8
+    } else if (y_offset == 8) {
+        __m256i src_next_reg;
+        for (i = 0; i < height ; i++) {
+          // load source + next source + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *)
+                                         (src + src_stride));
+          dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+          // average between current and next stride source
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+          // expand each byte to 2 bytes
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+        __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+        const int64_t y_offset64 = (int64_t)y_offset << 5;
+        filter = _mm256_load_si256((__m256i const *)
+                 (vp9_bilinear_filters_avx2 + y_offset64));
+#else
+        y_offset <<= 5;
+        filter = _mm256_load_si256((__m256i const *)
+                 (vp9_bilinear_filters_avx2 + y_offset));
+#endif
+        pw8 = _mm256_set1_epi16(8);
+        for (i = 0; i < height ; i++) {
+          // load current and next source + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *)
+                          (src + src_stride));
+          dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+          // merge current and next source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to the source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // expand each byte to 2 byte in the destination
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+      if (y_offset == 0) {
+        __m256i src_next_reg;
+        for (i = 0; i < height ; i++) {
+          // load source and another source starting from the next
+          // following byte + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+          // average between source and the next byte following source
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+          // expand each byte to 2 bytes
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+      // x_offset = 8  and y_offset = 8
+      } else if (y_offset == 8) {
+          __m256i src_next_reg, src_avg;
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+          // average between source and the next byte following source
+          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+            // average between source and the next byte following source
+            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // average between previous average to current average
+            src_avg = _mm256_avg_epu8(src_avg, src_reg);
+            // expand each byte to 2 bytes
+            exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+
+            // save current source average
+            src_avg = src_reg;
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            dst+= dst_stride;
+          }
+      // x_offset = 8  and y_offset = bilin interpolation
+      } else {
+          __m256i filter, pw8, src_next_reg, src_avg;
+#if (ARCH_X86_64)
+          const int64_t y_offset64 = (int64_t)y_offset << 5;
+          filter = _mm256_load_si256((__m256i const *)
+                   (vp9_bilinear_filters_avx2+y_offset64));
+#else
+          y_offset <<= 5;
+          filter = _mm256_load_si256((__m256i const *)
+                   (vp9_bilinear_filters_avx2 + y_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          // average between source and the next byte following source
+          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+            // average between source and the next byte following source
+            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+            // merge previous average and current average
+            exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+            // add 8 to the source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide the source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // save current source average
+            src_avg = src_reg;
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            dst+= dst_stride;
+          }
+      }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+      if (y_offset == 0) {
+        __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+        const int64_t x_offset64 = (int64_t)x_offset << 5;
+        filter = _mm256_load_si256((__m256i const *)
+                (vp9_bilinear_filters_avx2+x_offset64));
+#else
+        x_offset <<= 5;
+        filter = _mm256_load_si256((__m256i const *)
+                 (vp9_bilinear_filters_avx2 + x_offset));
+#endif
+        pw8 = _mm256_set1_epi16(8);
+        for (i = 0; i < height ; i++) {
+          // load source and another source starting from the next
+          // following byte + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+          // merge current and next source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide the source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // expand each byte to 2 bytes
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+      // x_offset = bilin interpolation and y_offset = 8
+      } else if (y_offset == 8) {
+          __m256i filter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+          const int64_t x_offset64 = (int64_t)x_offset << 5;
+          filter = _mm256_load_si256((__m256i const *)
+                  (vp9_bilinear_filters_avx2+x_offset64));
+#else
+          x_offset <<= 5;
+          filter = _mm256_load_si256((__m256i const *)
+                  (vp9_bilinear_filters_avx2 + x_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+          // merge current and next stride source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // convert each 16 bit to 8 bit to each low and high lane source
+          src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+            // merge current and next stride source
+            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // convert each 16 bit to 8 bit to each low and high lane source
+            src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+            // average between previous pack to the current
+            src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            // save previous pack
+            src_pack = src_reg;
+            dst+= dst_stride;
+          }
+      // x_offset = bilin interpolation and y_offset = bilin interpolation
+      } else {
+          __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+          const int64_t x_offset64 = (int64_t)x_offset << 5;
+          const int64_t y_offset64 = (int64_t)y_offset << 5;
+          xfilter = _mm256_load_si256((__m256i const *)
+                   (vp9_bilinear_filters_avx2+x_offset64));
+          yfilter = _mm256_load_si256((__m256i const *)
+                   (vp9_bilinear_filters_avx2+y_offset64));
+#else
+          x_offset <<= 5;
+          xfilter = _mm256_load_si256((__m256i const *)
+                   (vp9_bilinear_filters_avx2 + x_offset));
+          y_offset <<= 5;
+          yfilter = _mm256_load_si256((__m256i const *)
+                   (vp9_bilinear_filters_avx2 + y_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          // merge current and next stride source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+          // add 8 to the source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide the source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // convert each 16 bit to 8 bit to each low and high lane source
+          src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+            // merge current and next stride source
+            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // convert each 16 bit to 8 bit to each low and high lane source
+            src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+            // merge previous pack to current pack source
+            exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            src_pack = src_reg;
+            dst+= dst_stride;
+          }
+      }
+  }
+  // sum < 0
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);
+  // save the next 8 bytes of each lane of sse
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);
+  // merge the result of sum < 0  with sum to add sign to the next 16 bits
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);
+  // add each 8 bytes from every lane of sse and sum
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);
+
+  // save the next 4 bytes of each lane sse
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);
+  // save the next 8 bytes of each lane of sum
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);
+
+  // add the first 4 bytes to the next 4 bytes sse
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+  // add the first 8 bytes to the next 8 bytes
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+  // extract the low lane and the high lane and add the results
+  *sse = (unsigned int)(_mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +
+         _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)));
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+  return sum;
+}

diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index c9b90d5..02007a3 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c

@@ -42,6 +42,18 @@
   int *Sum
 );
 
+// Implemented in vp9_subpel_variance_impl_intrin_avx2.c. Works on a block
+// 32 pixels wide and `height` rows tall: stores the sum of squared
+// differences in *sse and returns the sum of differences.
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
+
 static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
                         const unsigned char *ref_ptr, int  recon_stride,
                         int  w, int  h, unsigned int *sse, int *sum,
@@ -155,3 +167,43 @@
   *sse = var;
   return (var - (((int64_t)avg * avg) >> 11));
 }
+
+// 64x64 sub-pixel variance, built from two 32-wide, 64-row column halves.
+// Stores the total sum of squared differences in *sse_ptr.
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset, int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  unsigned int sse_left, sse_right;
+  int sum_left, sum_right, sum;
+  // columns 0..31
+  sum_left = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
+                                             x_offset, y_offset,
+                                             dst, dst_stride, 64, &sse_left);
+  // columns 32..63
+  sum_right = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                              x_offset, y_offset, dst + 32,
+                                              dst_stride, 64, &sse_right);
+  sum = sum_left + sum_right;
+  *sse_ptr = sse_left + sse_right;
+  // variance = sse - sum^2 / (64 * 64)
+  return *sse_ptr - (((int64_t)sum * sum) >> 12);
+}
+
+// 32x32 sub-pixel variance: a single 32-wide pass over 32 rows.
+// Stores the SSE in *sse_ptr and returns sse - sum^2 / (32 * 32).
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset, int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  unsigned int sse_local;
+  const int sum = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
+                                                  x_offset, y_offset, dst,
+                                                  dst_stride, 32, &sse_local);
+  *sse_ptr = sse_local;
+  return sse_local - (((int64_t)sum * sum) >> 10);
+}

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 48d6a7c..a448b3c 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -80,6 +80,7 @@
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 8072f78..fbdad74 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -365,7 +365,7 @@
     memcpy(oxcf->ts_rate_decimator, cfg.ts_rate_decimator,
            sizeof(cfg.ts_rate_decimator));
   } else if (oxcf->ts_number_layers == 1) {
-    oxcf->ts_target_bitrate[0] = oxcf->target_bandwidth;
+    oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth;
     oxcf->ts_rate_decimator[0] = 1;
   }
 
@@ -639,7 +639,7 @@
 
     *x++ = marker;
     for (i = 0; i < ctx->pending_frame_count; i++) {
-      int this_sz = ctx->pending_frame_sizes[i];
+      unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
 
       for (j = 0; j <= mag; j++) {
         *x++ = this_sz & 0xff;
@@ -1049,7 +1049,7 @@
     return VPX_CODEC_INVALID_PARAM;
   }
   if (cpi->svc.spatial_layer_id < 0 ||
-      cpi->svc.spatial_layer_id >= ctx->cfg.ss_number_layers) {
+      cpi->svc.spatial_layer_id >= (int)ctx->cfg.ss_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
   return VPX_CODEC_OK;

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 1941fc0..76cbebf 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -214,7 +214,7 @@
                        ? sizeof(vp9_stream_info_t)
                        : sizeof(vpx_codec_stream_info_t);
   memcpy(si, &ctx->si, sz);
-  si->sz = sz;
+  si->sz = (unsigned int)sz;
 
   return VPX_CODEC_OK;
 }
@@ -462,7 +462,7 @@
     while (data_start < data_end && *data_start == 0)
       data_start++;
 
-    data_sz = data_end - data_start;
+    data_sz = (unsigned int)(data_end - data_start);
   } while (data_start < data_end);
   return res;
 }

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c0d973b..27dd6f6 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -86,6 +86,7 @@
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index d0ac1af..f7dde62 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h

@@ -297,9 +297,16 @@
   int alt_fb_idx;             /**< alt reference frame frame buffer index */
 } vpx_svc_parameters_t;
 
+/*!\brief VP9 SVC layer parameters
+ *
+ * This defines the spatial and temporal layer id numbers for SVC encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and
+ * temporal layer id for the current frame.
+ *
+ */
 typedef struct vpx_svc_layer_id {
-  int spatial_layer_id;
-  int temporal_layer_id;
+  int spatial_layer_id;       /**< Spatial layer id number. */
+  int temporal_layer_id;      /**< Temporal layer id number. */
 } vpx_svc_layer_id_t;
 
 /*!\brief VP8 encoder control function parameter type

diff --git a/vpxdec.c b/vpxdec.c
index d8157d0..660f613 100644
--- a/vpxdec.c
+++ b/vpxdec.c

@@ -791,7 +791,8 @@
 
         vpx_usec_timer_start(&timer);
 
-        if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
+        if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer,
+                             NULL, 0)) {
           const char *detail = vpx_codec_error_detail(&decoder);
           warn("Failed to decode frame %d: %s",
                frame_in, vpx_codec_error(&decoder));
@@ -873,7 +874,7 @@
                                         vpx_input_ctx.height,
                                         &vpx_input_ctx.framerate, img->fmt);
             if (do_md5) {
-              MD5Update(&md5_ctx, (md5byte *)buf, len);
+              MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
             } else {
               fputs(buf, outfile);
             }
@@ -882,7 +883,7 @@
           // Y4M frame header
           len = y4m_write_frame_header(buf, sizeof(buf));
           if (do_md5) {
-            MD5Update(&md5_ctx, (md5byte *)buf, len);
+            MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
           } else {
             fputs(buf, outfile);
           }