Merge "vp9_filter: move table alignment decl's to header"
diff --git a/docs.mk b/docs.mk
index 9426f76..797b466 100644
--- a/docs.mk
+++ b/docs.mk
@@ -30,7 +30,9 @@
EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
+EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
doxyfile: libs.doxy_template libs.doxy
@echo " [CREATE] $@"
@cat $^ > $@
diff --git a/examples.mk b/examples.mk
index e4abcf7..40756e1 100644
--- a/examples.mk
+++ b/examples.mk
@@ -285,3 +285,36 @@
$(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe)))
$(foreach proj,$(call enabled,PROJECTS),\
$(eval $(call vcproj_template,$(proj))))
+
+# Documentation Rules
+#   <name>.dox wraps <name>.c in a Doxygen page called example_<name>
+#   whose body is a line-numbered listing of that source file.
+%.dox: %.c
+	@echo " [DOXY] $@"
+	@{ echo "/*!\page example_$(basename $(@F)) $(basename $(@F))"; \
+	   echo " \includelineno $(notdir $<)"; \
+	   echo "*/"; } > $@
+# samples.dox: Doxygen index page that links every sample and utility.
+samples.dox: examples.mk
+	@echo " [DOXY] $@"
+	@echo "/*!\page samples Sample Code" > $@
+	@echo " This SDK includes a number of sample applications."\
+	      "Each sample documents a feature of the SDK in both prose"\
+	      "and the associated C code."\
+	      "The following samples are included: ">>$@
+	@$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\
+	   echo " - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo >> $@
+	@echo " In addition, the SDK contains a number of utilities."\
+	      "Since these utilities are built upon the concepts described"\
+	      "in the sample code listed above, they are not documented in"\
+	      "pieces like the samples are. Their source is included here"\
+	      "for reference. The following utilities are included:" >> $@
+	@$(foreach ex,$(sort $(notdir $(UTILS:.c=))),\
+	   echo " - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo "*/" >> $@
+# examples.doxy: one "INPUT +=" line naming every generated .dox page.
+examples.doxy: samples.dox $(patsubst %.c,%.dox,$(ALL_EXAMPLES))
+	@echo "INPUT += $^" > $@
+DOCS-yes += examples.doxy samples.dox
+CLEAN-OBJS += examples.doxy samples.dox $(patsubst %.c,%.dox,$(ALL_EXAMPLES))
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 3aa15d2..dc9856f 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -315,8 +315,8 @@
vdup.u16 q2, r2
vadd.s16 q1, q1, q3
vadd.s16 q2, q2, q3
- vqshrun.s16 d0, q1, #0
- vqshrun.s16 d1, q2, #0
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
@@ -327,8 +327,8 @@
vdup.u16 q2, r2
vadd.s16 q1, q1, q3
vadd.s16 q2, q2, q3
- vqshrun.s16 d0, q1, #0
- vqshrun.s16 d1, q2, #0
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
bx lr
@@ -372,10 +372,10 @@
vadd.s16 q8, q3, q8
vadd.s16 q9, q3, q9
- vqshrun.s16 d0, q0, #0
- vqshrun.s16 d1, q1, #0
- vqshrun.s16 d2, q8, #0
- vqshrun.s16 d3, q9, #0
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
vst1.64 {d0}, [r0], r1
vst1.64 {d1}, [r0], r1
@@ -394,10 +394,10 @@
vadd.s16 q8, q3, q8
vadd.s16 q9, q3, q9
- vqshrun.s16 d0, q0, #0
- vqshrun.s16 d1, q1, #0
- vqshrun.s16 d2, q8, #0
- vqshrun.s16 d3, q9, #0
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
vst1.64 {d0}, [r0], r1
vst1.64 {d1}, [r0], r1
@@ -445,10 +445,10 @@
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
- vqshrun.s16 d2, q1, #0
- vqshrun.s16 d3, q0, #0
- vqshrun.s16 d22, q11, #0
- vqshrun.s16 d23, q8, #0
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
vdup.16 q0, d20[2] ; proload next 2 rows data
vdup.16 q8, d20[3]
vst1.64 {d2,d3}, [r0], r1
@@ -459,10 +459,10 @@
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
- vqshrun.s16 d2, q1, #0
- vqshrun.s16 d3, q0, #0
- vqshrun.s16 d22, q11, #0
- vqshrun.s16 d23, q8, #0
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
vdup.16 q0, d21[0] ; proload next 2 rows data
vdup.16 q8, d21[1]
vst1.64 {d2,d3}, [r0], r1
@@ -472,10 +472,10 @@
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
- vqshrun.s16 d2, q1, #0
- vqshrun.s16 d3, q0, #0
- vqshrun.s16 d22, q11, #0
- vqshrun.s16 d23, q8, #0
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
vdup.16 q0, d21[2] ; proload next 2 rows data
vdup.16 q8, d21[3]
vst1.64 {d2,d3}, [r0], r1
@@ -486,10 +486,10 @@
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
- vqshrun.s16 d2, q1, #0
- vqshrun.s16 d3, q0, #0
- vqshrun.s16 d22, q11, #0
- vqshrun.s16 d23, q8, #0
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
vld1.8 {d18}, [r3]! ; preload 8 left into r12
vmovl.u8 q10, d18
vst1.64 {d2,d3}, [r0], r1
@@ -542,19 +542,19 @@
vadd.s16 q13, q0, q9
vadd.s16 q14, q0, q10
vadd.s16 q15, q0, q11
- vqshrun.s16 d0, q12, #0
- vqshrun.s16 d1, q13, #0
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
- vqshrun.s16 d2, q14, #0
- vqshrun.s16 d3, q15, #0
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
- vqshrun.s16 d24, q12, #0
- vqshrun.s16 d25, q13, #0
- vqshrun.s16 d26, q14, #0
- vqshrun.s16 d27, q15, #0
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
vdup.16 q1, d6[2]
vdup.16 q2, d6[3]
vst1.64 {d24-d27}, [r0], r1
@@ -564,19 +564,19 @@
vadd.s16 q13, q1, q9
vadd.s16 q14, q1, q10
vadd.s16 q15, q1, q11
- vqshrun.s16 d0, q12, #0
- vqshrun.s16 d1, q13, #0
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
- vqshrun.s16 d2, q14, #0
- vqshrun.s16 d3, q15, #0
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
- vqshrun.s16 d24, q12, #0
- vqshrun.s16 d25, q13, #0
- vqshrun.s16 d26, q14, #0
- vqshrun.s16 d27, q15, #0
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
vdup.16 q0, d7[0]
vdup.16 q2, d7[1]
vst1.64 {d24-d27}, [r0], r1
@@ -586,19 +586,19 @@
vadd.s16 q13, q0, q9
vadd.s16 q14, q0, q10
vadd.s16 q15, q0, q11
- vqshrun.s16 d0, q12, #0
- vqshrun.s16 d1, q13, #0
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
- vqshrun.s16 d2, q14, #0
- vqshrun.s16 d3, q15, #0
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
- vqshrun.s16 d24, q12, #0
- vqshrun.s16 d25, q13, #0
- vqshrun.s16 d26, q14, #0
- vqshrun.s16 d27, q15, #0
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
vdup.16 q0, d7[2]
vdup.16 q2, d7[3]
vst1.64 {d24-d27}, [r0], r1
@@ -608,20 +608,20 @@
vadd.s16 q13, q0, q9
vadd.s16 q14, q0, q10
vadd.s16 q15, q0, q11
- vqshrun.s16 d0, q12, #0
- vqshrun.s16 d1, q13, #0
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
- vqshrun.s16 d2, q14, #0
- vqshrun.s16 d3, q15, #0
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
- vqshrun.s16 d24, q12, #0
- vqshrun.s16 d25, q13, #0
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
vld1.8 {d0}, [r3]! ; preload 8 left pixels
- vqshrun.s16 d26, q14, #0
- vqshrun.s16 d27, q15, #0
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
vmovl.u8 q3, d0
vst1.64 {d24-d27}, [r0], r1
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 6317103..a18ae9b 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -386,7 +386,7 @@
specialize vp9_variance4x4 mmx $sse2_x86inc
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
@@ -416,7 +416,7 @@
specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index a2cf910..1b4904c 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -142,20 +142,29 @@
#if HAVE_AVX2
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
@@ -183,12 +192,26 @@
FUN_CONV_2D(, avx2);
#endif
#if HAVE_SSSE3
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..dbea141
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// byte-shuffle masks (not filter taps) used only by the 4_h8 convolution
+DECLARE_ALIGNED(16, const unsigned char, filt1_4_h8[16])= {
+ 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
+
+DECLARE_ALIGNED(16, const unsigned char, filt2_4_h8[16])= {
+ 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
+
+// byte-shuffle masks (not filter taps) shared by 8_h8 and 16_h8
+DECLARE_ALIGNED(16, const unsigned char, filt1_global[16])= {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
+
+DECLARE_ALIGNED(16, const unsigned char, filt2_global[16])= {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
+
+DECLARE_ALIGNED(16, const unsigned char, filt3_global[16])= {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
+
+DECLARE_ALIGNED(16, const unsigned char, filt4_global[16])= {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
+// 8-tap horizontal filter producing 4 output pixels per row (SSSE3).
+void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ unsigned int i;
+
+ // rounding constant: 64 in each 16-bit lane (added before the >> 7)
+ addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // both 64-bit halves of the register end up holding the same 8 taps.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // low 64 bits of firstFilters: the first 16 bits (taps 0,1) repeated
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+ // low 64 bits of secondFilters: the third 16 bits (taps 4,5) repeated
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+ // high 64 bits of firstFilters: the second 16 bits (taps 2,3) repeated
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+ // high 64 bits of secondFilters: the fourth 16 bits (taps 6,7) repeated
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ // load the shuffle masks; here these two names hold masks, not taps
+ thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
+ forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // move the upper 64 bits (taps 2,3 / taps 6,7 sums) into the low half
+ srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+ srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+ minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+ // add and saturate all the results together (middle pair: min first)
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // arithmetic shift right by 7 in each 16-bit lane
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // pack each 16-bit lane to unsigned 8 bit with saturation
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ src_ptr+=src_pixels_per_line;
+
+ // save only 4 bytes
+ *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr+=output_pitch;
+ }
+}
+// 8-tap horizontal filter producing 8 output pixels per row (SSSE3).
+void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, minReg;
+ unsigned int i;
+
+ // rounding constant: 64 in each 16-bit lane (added before the >> 7)
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // both 64-bit halves of the register end up holding the same 8 taps.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+ srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+ // add and saturate all the results together (last pair: min first)
+ minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+ srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // arithmetic shift right by 7 in each 16-bit lane
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // pack each 16-bit lane to unsigned 8 bit with saturation
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+ output_ptr+=output_pitch;
+ }
+}
+// 8-tap horizontal filter producing 16 output pixels per row (SSSE3).
+void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+ unsigned int i;
+
+ // rounding constant: 64 in each 16-bit lane (added before the >> 7)
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // both 64-bit halves of the register end up holding the same 8 taps.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+ // add and saturate the smaller of the two remaining sums first
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+ // read the next 16 bytes
+ // (part of them was already covered by the earlier read)
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+ // then add and saturate the larger of the two remaining sums
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // shuffle the source bytes into adjacent pairs for the multiply-add
+ srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+ // add and saturate the results together (min first, then max)
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+ // arithmetic shift right by 7 in each 16-bit lane
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+ // pack each 16-bit lane to unsigned 8 bit with saturation; the low
+ // 8 bytes hold the first convolve result and the high 8 bytes hold
+ // the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+ output_ptr+=output_pitch;
+ }
+}
+// 8-tap vertical filter producing 8 output pixels per row (SSSE3).
+void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ unsigned int i;
+
+ // rounding constant: 64 in each 16-bit lane (added before the >> 7)
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // both 64-bit halves of the register end up holding the same 8 taps.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ for (i = 0; i < output_height; i++) {
+ // load the first 8 bytes (row 0)
+ srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+ // load 8 bytes from each of rows 1, 2 and 3 (src_pitch apart)
+ srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
+ srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
+ srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+
+ // interleave rows 0/1 and rows 2/3 byte by byte
+ srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+
+ // load 8 bytes from each of rows 4, 5, 6 and 7 (src_pitch apart)
+ srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
+ srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
+ srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
+ srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+
+ // interleave rows 4/5 and rows 6/7 byte by byte
+ srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+ // add and saturate the results together (middle pair: min first)
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // arithmetic shift right by 7 in each 16-bit lane
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // pack each 16-bit lane to unsigned 8 bit with saturation
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
+// 8-tap vertical filter producing 16 output pixels per row (SSSE3).
+void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ unsigned int i;
+
+ // rounding constant: 64 in each 16-bit lane (added before the >> 7)
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((__m128i *)filter);
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // both 64-bit halves of the register end up holding the same 8 taps.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ for (i = 0; i < output_height; i++) {
+ // load the first 16 bytes (row 0)
+ srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
+ // load rows 1, 6 and 7 (row k starts at src_ptr + k * src_pitch)
+ srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
+ srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
+ srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+ // interleave rows 0/1 and rows 6/7 (low and high 8 bytes separately)
+ srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+ srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+ // load rows 2 and 3 (16 bytes each)
+ srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
+ srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
+
+ // interleave rows 2/3 (low and high 8 bytes separately)
+ srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+ srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+ // load rows 4 and 5 (16 bytes each)
+ srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
+ srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
+
+ // interleave rows 4/5 (low and high 8 bytes separately)
+ srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+ srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+ // add and saturate the smaller of the two middle sums first
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+ _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+ // then add and saturate the larger one and the rounding constant
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+ _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // arithmetic shift right by 7 in each 16-bit lane
+ srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // pack each 16-bit lane to unsigned 8 bit with saturation; the low
+ // 8 bytes hold the first convolve result and the high 8 bytes hold
+ // the second convolve result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8023466..1a9ab60 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1047,28 +1047,9 @@
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblock_plane *const p = x->plane;
- struct macroblockd_plane *const pd = xd->plane;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
const int mb_mode_index = ctx->best_mode_index;
- int max_plane;
-
- max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
- for (i = 0; i < max_plane; ++i) {
- p[i].coeff = ctx->coeff_pbuf[i][1];
- p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
- pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
- p[i].eobs = ctx->eobs_pbuf[i][1];
- }
-
- for (i = max_plane; i < MAX_MB_PLANE; ++i) {
- p[i].coeff = ctx->coeff_pbuf[i][2];
- p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
- pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
- p[i].eobs = ctx->eobs_pbuf[i][2];
- }
-
x->skip = ctx->skip;
if (frame_is_intra_only(cm)) {
@@ -1128,8 +1109,8 @@
}
static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
- TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE bsize) {
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
@@ -1147,7 +1128,6 @@
ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
mi_row, mi_col, bsize);
subsize = mi_8x8[0]->mbmi.sb_type;
-
} else {
ctx = 0;
subsize = BLOCK_4X4;
@@ -2272,11 +2252,11 @@
}
static void rtc_use_partition(VP9_COMP *cpi,
- const TileInfo *const tile,
- MODE_INFO **mi_8x8,
- TOKENEXTRA **tp, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int *rate, int64_t *dist,
- int do_recon) {
+ const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ int do_recon) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -2334,6 +2314,7 @@
}
}
}
+
encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
*rate = chosen_rate;
@@ -2417,6 +2398,22 @@
set_prev_mi(cm);
+ if (cpi->sf.use_pick_mode) {
+ // Initialize internal buffer pointers for rtc coding, where non-RD
+ // mode decision is used and hence no buffer pointer swap needed.
+ int i;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][0];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+ p[i].eobs = ctx->eobs_pbuf[i][0];
+ }
+ }
+
{
struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
@@ -2700,6 +2697,7 @@
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
(cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
!cpi->sf.use_pick_mode;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 556197c..c2aac3e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -65,7 +65,7 @@
double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
- for (i = 0; i < QINDEX_RANGE; i++) {
+ for (i = 0; i < QINDEX_RANGE; ++i) {
if (target_q <= vp9_convert_qindex_to_q(i)) {
ret_val = i;
break;
@@ -399,7 +399,7 @@
// Refine the motion search range according to the frame dimension
// for first pass test.
while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
- sr++;
+ ++sr;
step_param += sr;
further_steps -= sr;
@@ -427,10 +427,10 @@
num00 = 0;
while (n < further_steps) {
- n++;
+ ++n;
if (num00) {
- num00--;
+ --num00;
} else {
tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
step_param + n, x->sadperbit16,
@@ -522,7 +522,7 @@
// Tiling is ignored in the first pass.
vp9_tile_init(&tile, cm, 0, 0);
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
int_mv best_ref_mv;
best_ref_mv.as_int = 0;
@@ -538,7 +538,7 @@
x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+ BORDER_MV_PIXELS_B16;
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
int this_error;
const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
double error_weight = 1.0;
@@ -638,7 +638,7 @@
}
if (gf_motion_error < motion_error && gf_motion_error < this_error)
- second_ref_count++;
+ ++second_ref_count;
// Reset to last frame as reference buffer.
xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
@@ -665,7 +665,7 @@
// cropped clips with black bars at the sides or top and bottom.
if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
this_error < 2 * intrapenalty)
- neutral_count++;
+ ++neutral_count;
mv.as_mv.row *= 8;
mv.as_mv.col *= 8;
@@ -682,42 +682,42 @@
sum_mvc_abs += abs(mv.as_mv.col);
sum_mvrs += mv.as_mv.row * mv.as_mv.row;
sum_mvcs += mv.as_mv.col * mv.as_mv.col;
- intercount++;
+ ++intercount;
best_ref_mv.as_int = mv.as_int;
if (mv.as_int) {
- mvcount++;
+ ++mvcount;
// Non-zero vector, was it different from the last non zero vector?
if (mv.as_int != lastmv_as_int)
- new_mv_count++;
+ ++new_mv_count;
lastmv_as_int = mv.as_int;
// Does the row vector point inwards or outwards?
if (mb_row < cm->mb_rows / 2) {
if (mv.as_mv.row > 0)
- sum_in_vectors--;
+ --sum_in_vectors;
else if (mv.as_mv.row < 0)
- sum_in_vectors++;
+ ++sum_in_vectors;
} else if (mb_row > cm->mb_rows / 2) {
if (mv.as_mv.row > 0)
- sum_in_vectors++;
+ ++sum_in_vectors;
else if (mv.as_mv.row < 0)
- sum_in_vectors--;
+ --sum_in_vectors;
}
// Does the col vector point inwards or outwards?
if (mb_col < cm->mb_cols / 2) {
if (mv.as_mv.col > 0)
- sum_in_vectors--;
+ --sum_in_vectors;
else if (mv.as_mv.col < 0)
- sum_in_vectors++;
+ ++sum_in_vectors;
} else if (mb_col > cm->mb_cols / 2) {
if (mv.as_mv.col > 0)
- sum_in_vectors++;
+ ++sum_in_vectors;
else if (mv.as_mv.col < 0)
- sum_in_vectors--;
+ --sum_in_vectors;
}
}
}
@@ -802,7 +802,7 @@
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
twopass->sr_update_lag = 1;
} else {
- twopass->sr_update_lag++;
+ ++twopass->sr_update_lag;
}
// Swap frame pointers so last frame refers to the frame we just compressed.
swap_yv12(lst_yv12, new_yv12);
@@ -830,7 +830,7 @@
fclose(recon_file);
}
- cm->current_video_frame++;
+ ++cm->current_video_frame;
}
// Estimate a cost per mb attributable to overheads such as the coding of modes
@@ -910,7 +910,7 @@
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
- for (q = rc->best_quality; q < rc->worst_quality; q++) {
+ for (q = rc->best_quality; q < rc->worst_quality; ++q) {
const double err_correction_factor = calc_correction_factor(err_per_mb,
ERR_DIVISOR, 0.5, 0.90, q);
const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
@@ -1045,7 +1045,7 @@
FIRSTPASS_STATS tmp_next_frame;
// Look ahead a few frames to see if static condition persists...
- for (j = 0; j < still_interval; j++) {
+ for (j = 0; j < still_interval; ++j) {
if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
break;
@@ -1164,7 +1164,7 @@
int flash_detected = 0;
// Search forward from the proposed arf/next gf position.
- for (i = 0; i < f_frames; i++) {
+ for (i = 0; i < f_frames; ++i) {
if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
break;
@@ -1201,7 +1201,7 @@
abs_mv_in_out_accumulator = 0.0;
// Search backward towards last gf position.
- for (i = -1; i >= -b_frames; i--) {
+ for (i = -1; i >= -b_frames; --i) {
if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
break;
@@ -1443,7 +1443,7 @@
i = 0;
while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) {
- i++;
+ ++i;
// Accumulate error score of frames in this gf group.
mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1515,7 +1515,7 @@
// Don't allow a gf too near the next kf.
if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
- i++;
+ ++i;
if (EOF == input_stats(twopass, this_frame))
break;
@@ -1752,7 +1752,7 @@
zero_stats(&sectionstats);
reset_fpf_position(twopass, start_pos);
- for (i = 0; i < rc->baseline_gf_interval; i++) {
+ for (i = 0; i < rc->baseline_gf_interval; ++i) {
input_stats(twopass, &next_frame);
accumulate_stats(&sectionstats, &next_frame);
}
@@ -1837,7 +1837,7 @@
start_pos = cpi->twopass.stats_in;
// Examine how well the key frame predicts subsequent frames.
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < 16; ++i) {
double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
@@ -1956,7 +1956,7 @@
// quality since the last GF or KF.
recent_loop_decay[i % 8] = loop_decay_rate;
decay_accumulator = 1.0;
- for (j = 0; j < 8; j++)
+ for (j = 0; j < 8; ++j)
decay_accumulator *= recent_loop_decay[j];
// Special check for transition or high motion followed by a
@@ -1966,7 +1966,7 @@
break;
// Step on to the next frame.
- rc->frames_to_key++;
+ ++rc->frames_to_key;
// If we don't have a real key frame within the next two
// key_frame_frequency intervals then break out of the loop.
@@ -1975,7 +1975,7 @@
} else {
++rc->frames_to_key;
}
- i++;
+ ++i;
}
// If there is a max kf interval set by the user we must obey it.
@@ -1997,7 +1997,7 @@
kf_group_err = 0;
// Rescan to get the correct error data for the forced kf group.
- for (i = 0; i < rc->frames_to_key; i++) {
+ for (i = 0; i < rc->frames_to_key; ++i) {
// Accumulate kf group errors.
kf_group_err += calculate_modified_err(cpi, &tmp_frame);
@@ -2046,7 +2046,7 @@
boost_score = 0.0;
// Scan through the kf group collating various stats.
- for (i = 0; i < rc->frames_to_key; i++) {
+ for (i = 0; i < rc->frames_to_key; ++i) {
double r;
if (EOF == input_stats(twopass, &next_frame))
@@ -2089,7 +2089,7 @@
zero_stats(&sectionstats);
reset_fpf_position(twopass, start_position);
- for (i = 0; i < rc->frames_to_key; i++) {
+ for (i = 0; i < rc->frames_to_key; ++i) {
input_stats(twopass, &next_frame);
accumulate_stats(&sectionstats, &next_frame);
}
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 4561e76..23274fc 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -217,6 +217,7 @@
// Computes a q delta (in "q index" terms) to get from a starting q value
// to a value that should equate to thegiven rate ratio.
+
static int compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index,
double rate_target_ratio) {
int i;
@@ -1110,7 +1111,7 @@
void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
VP9_COMMON *const cm = &cpi->common;
- int64_t vbr_max_bits;
+ int vbr_max_bits;
if (framerate < 0.1)
framerate = 30;
@@ -1134,10 +1135,10 @@
// be acheived because of a user specificed max q (e.g. when the user
// specifies lossless encode.
//
- vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
- (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+ vbr_max_bits = (int)(((int64_t)cpi->rc.av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmax_section) / 100);
cpi->rc.max_frame_bandwidth =
- MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+ MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
// Set Maximum gf/arf interval
cpi->rc.max_gf_interval = 16;
@@ -1158,7 +1159,7 @@
cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
}
-static int64_t rescale(int val, int64_t num, int denom) {
+static int64_t rescale(int64_t val, int64_t num, int denom) {
int64_t llnum = num;
int64_t llden = denom;
int64_t llval = val;
@@ -1211,9 +1212,12 @@
lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000;
bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth;
// Update buffer-related quantities.
- lc->starting_buffer_level = oxcf->starting_buffer_level * bitrate_alloc;
- lc->optimal_buffer_level = oxcf->optimal_buffer_level * bitrate_alloc;
- lc->maximum_buffer_size = oxcf->maximum_buffer_size * bitrate_alloc;
+ lc->starting_buffer_level =
+ (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
+ lc->optimal_buffer_level =
+ (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
+ lc->maximum_buffer_size =
+ (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
// Update framerate-related quantities.
@@ -1245,8 +1249,8 @@
int prev_layer_target_bandwidth =
oxcf->ts_target_bitrate[temporal_layer - 1] * 1000;
lc->avg_frame_size =
- (int)(lc->target_bandwidth - prev_layer_target_bandwidth) /
- (lc->framerate - prev_layer_framerate);
+ (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
+ (lc->framerate - prev_layer_framerate));
}
}
@@ -1274,7 +1278,7 @@
int temporal_layer = cpi->svc.temporal_layer_id;
LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
lc->rc = cpi->rc;
- lc->target_bandwidth = cpi->oxcf.target_bandwidth;
+ lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
@@ -1491,7 +1495,7 @@
if (cpi->svc.number_temporal_layers > 1 &&
cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
- update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+ update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth);
}
cpi->speed = abs(cpi->oxcf.cpu_used);
@@ -1568,6 +1572,7 @@
int num_pix = num_4x4_blk << 4;
int i, k;
ctx->num_4x4_blk = num_4x4_blk;
+
CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -1611,7 +1616,6 @@
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
-
for (i = 0; i < BLOCK_SIZES; ++i) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
const int num_4x4_h = num_4x4_blocks_high_lookup[i];
@@ -2029,10 +2033,12 @@
/ time_encoded;
if (cpi->b_calculate_psnr) {
- const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0,
- cpi->total_sq_error);
- const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0,
- cpi->totalp_sq_error);
+ const double total_psnr =
+ vp9_mse2psnr((double)cpi->total_samples, 255.0,
+ (double)cpi->total_sq_error);
+ const double totalp_psnr =
+ vp9_mse2psnr((double)cpi->totalp_samples, 255.0,
+ (double)cpi->totalp_sq_error);
const double total_ssim = 100 * pow(cpi->summed_quality /
cpi->summed_weights, 8.0);
const double totalp_ssim = 100 * pow(cpi->summedp_quality /
@@ -2208,20 +2214,20 @@
const int w = widths[i];
const int h = heights[i];
const uint32_t samples = w * h;
- const double sse = calc_plane_error(a_planes[i], a_strides[i],
- b_planes[i], b_strides[i],
- w, h);
+ const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i],
+ w, h);
psnr->sse[1 + i] = sse;
psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse);
+ psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, (double) sse);
- total_sse += sse;
+ total_sse += (uint64_t)sse;
total_samples += samples;
}
psnr->sse[0] = total_sse;
psnr->samples[0] = total_samples;
- psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse);
+ psnr->psnr[0] = vp9_mse2psnr((double)total_samples, 255.0, (double)total_sse);
}
static void generate_psnr_packet(VP9_COMP *cpi) {
@@ -2892,7 +2898,7 @@
if (!cpi->sf.use_pick_mode)
vp9_pack_bitstream(cpi, dest, size);
- cpi->rc.projected_frame_size = (*size) << 3;
+ cpi->rc.projected_frame_size = (int)(*size) << 3;
vp9_restore_coding_context(cpi);
if (frame_over_shoot_limit == 0)
@@ -3762,7 +3768,7 @@
#if CONFIG_INTERNAL_STATS
if (cpi->pass != 1) {
- cpi->bytes += *size;
+ cpi->bytes += (int)(*size);
if (cm->show_frame) {
cpi->count++;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 822185a..945fa81 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -127,11 +127,52 @@
// calculate the bit cost on motion vector
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-
-
return bestsme;
}
+// Refines *tmp_mv in place by calling cpi->find_fractional_mv_step, passing
+// mbmi->ref_mvs[frame_ref][0] as the reference MV. When a scaled copy of the
+// reference frame is available, the pre[0] plane buffers point at it for the
+// duration of the search and are restored before returning.
+static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  const int frame_ref = mbmi->ref_frame[0];
+  int_mv ref_mv = mbmi->ref_mvs[frame_ref][0];
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      vp9_get_scaled_ref_frame(cpi, frame_ref);
+  struct buf_2d saved_pre[MAX_MB_PLANE] = {{0}};
+  int plane;
+  int dis_out;  // handed to the search by address; not read afterwards
+
+  if (scaled_ref_frame != NULL) {
+    // Save the pre[0] buffers, then repoint them at the scaled frame.
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      saved_pre[plane] = xd->plane[plane].pre[0];
+    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  // NOTE(review): presumably converts 1/8-pel units to full-pel -- confirm.
+  tmp_mv->as_mv.row >>= 3;
+  tmp_mv->as_mv.col >>= 3;
+
+  cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+                               cpi->common.allow_high_precision_mv,
+                               x->errorperbit, &cpi->fn_ptr[bsize],
+                               cpi->sf.subpel_force_stop,
+                               cpi->sf.subpel_iters_per_step,
+                               x->nmvjointcost, x->mvcost,
+                               &dis_out, &x->pred_sse[frame_ref]);
+
+  if (scaled_ref_frame != NULL) {
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      xd->plane[plane].pre[0] = saved_pre[plane];
+  }
+}
+
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// this needs various further optimizations. to be continued..
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -161,6 +202,7 @@
// initialize mode decisions
*returnrate = INT_MAX;
+ *returndistortion = INT64_MAX;
vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
mbmi->sb_type = bsize;
mbmi->ref_frame[0] = NONE;
@@ -200,9 +242,6 @@
int64_t dist;
if (this_mode == NEWMV) {
- if (this_rd < 300)
- continue;
-
x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
&frame_mv[NEWMV][ref_frame], &rate_mv);
@@ -226,6 +265,13 @@
}
// TODO(jingning) sub-pixel motion search, if NEWMV is chosen
+ if (mbmi->mode == NEWMV) {
+ ref_frame = mbmi->ref_frame[0];
+ sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame]);
+ mbmi->mv[0].as_int = frame_mv[NEWMV][ref_frame].as_int;
+ xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ }
// TODO(jingning) intra prediction search, if the best SAD is above a certain
// threshold.
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 862573f..372c362 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -26,7 +26,7 @@
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, non_zero_count = count, eob = -1;
+ int i, non_zero_count = (int)count, eob = -1;
const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
zbin_ptr[1] + zbin_oq_value };
const int nzbins[2] = { zbins[0] * -1,
@@ -37,7 +37,7 @@
if (!skip_block) {
// Pre-scan pass
- for (i = count - 1; i >= 0; i--) {
+ for (i = (int)count - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 4e2e268..2be00ff 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -257,7 +257,8 @@
add_token(&t, coef_probs[band[c]][pt],
vp9_dct_value_tokens_ptr[v].extra,
- vp9_dct_value_tokens_ptr[v].token, skip_eob,
+ (uint8_t)vp9_dct_value_tokens_ptr[v].token,
+ (uint8_t)skip_eob,
counts[band[c]][pt]);
eob_branch[band[c]][pt] += !skip_eob;
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
new file mode 100644
index 0000000..a8f98e9
--- /dev/null
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "vpx_ports/mem.h"
+#include "vp9/encoder/vp9_variance.h"
+
+// Bilinear filter taps for the AVX2 sub-pixel variance code in this file.
+// The table holds 16 entries of 32 bytes each; entry k (k = 0..15) is the
+// byte pair (16 - k, k) repeated 16 times, so the two taps always sum to 16.
+// The variance routine below indexes it with (offset << 5), i.e. offset * 32
+// bytes, loads a whole entry with _mm256_load_si256 (hence the 32-byte
+// alignment), applies it with _mm256_maddubs_epi16 to interleaved pixel
+// pairs, then rounds with +8 and shifts right by 4.
+DECLARE_ALIGNED(32, const unsigned char, vp9_bilinear_filters_avx2[512])= {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+ 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+ 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+ 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+ 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+ 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+ 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+ 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
+ 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15};
+
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ int height,
+ unsigned int *sse) {
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ if (x_offset == 0) {
+ // x_offset = 0 and y_offset = 0
+ if (y_offset == 0) {
+ for (i = 0; i < height ; i++) {
+ // load source and destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+ // expand each byte to 2 bytes
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ // load source + next source + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *)
+ (src + src_stride));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+ // average between current and next stride source
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+ // expand each byte to 2 bytes
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+ int64_t y_offset64;
+ y_offset64 = y_offset;
+ y_offset64 <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + y_offset64));
+#else
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + y_offset));
+#endif
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ // load current and next source + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *)
+ (src + src_stride));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+ // merge current and next source
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+ // add 8 to the source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // expand each byte to 2 byte in the destination
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ // load source and another source starting from the next
+ // following byte + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+ // average between source and the next byte following source
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+ // expand each byte to 2 bytes
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+ // load source and another source starting from the next
+ // following byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+ // average between source and the next byte following source
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ // load source and another source starting from the next
+ // following byte + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+ // average between source and the next byte following source
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+ // expand each byte to 2 bytes
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // average between previous average to current average
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ // expand each byte to 2 bytes
+ exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+
+ // save current source average
+ src_avg = src_reg;
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+#if (ARCH_X86_64)
+ int64_t y_offset64;
+ y_offset64 = y_offset;
+ y_offset64 <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2+y_offset64));
+#else
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + y_offset));
+#endif
+ pw8 = _mm256_set1_epi16(8);
+ // load source and another source starting from the next
+ // following byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ // average between source and the next byte following source
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ // load source and another source starting from the next
+ // following byte + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+ // average between source and the next byte following source
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+ // merge previous average and current average
+ exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+ // add 8 to the source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide the source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // expand each byte to 2 bytes
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // save current source average
+ src_avg = src_reg;
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+ int64_t x_offset64;
+ x_offset64 = x_offset;
+ x_offset64 <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2+x_offset64));
+#else
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + x_offset));
+#endif
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ // load source and another source starting from the next
+ // following byte + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+ // merge current and next source
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+ // add 8 to source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide the source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // expand each byte to 2 bytes
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+ int64_t x_offset64;
+ x_offset64 = x_offset;
+ x_offset64 <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2+x_offset64));
+#else
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + x_offset));
+#endif
+ pw8 = _mm256_set1_epi16(8);
+ // load source and another source starting from the next
+ // following byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+ // merge current and next stride source
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+ // add 8 to source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // convert each 16 bit to 8 bit to each low and high lane source
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+
+ // load source and another source starting from the next
+ // following byte + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+ // merge current and next stride source
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+ // add 8 to source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // convert each 16 bit to 8 bit to each low and high lane source
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ // average between previous pack to the current
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+ // expand each byte to 2 bytes
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ // save previous pack
+ src_pack = src_reg;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+ int64_t x_offset64, y_offset64;
+ x_offset64 = x_offset;
+ x_offset64 <<= 5;
+ y_offset64 = y_offset;
+ y_offset64 <<= 5;
+ xfilter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2+x_offset64));
+ yfilter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2+y_offset64));
+#else
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256((__m256i const *)
+ (vp9_bilinear_filters_avx2 + y_offset));
+#endif
+ pw8 = _mm256_set1_epi16(8);
+ // load source and another source starting from the next
+ // following byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ // merge current and next stride source
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+ // add 8 to the source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide the source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // convert each 16 bit to 8 bit to each low and high lane source
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ // load source and another source starting from the next
+ // following byte + destination
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+ dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+ // merge current and next stride source
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+ // add 8 to source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // convert each 16 bit to 8 bit to each low and high lane source
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+ // merge previous pack to current pack source
+ exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg);
+ exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg);
+
+ // filter the source
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter);
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter);
+
+ // expand each byte to 2 bytes
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+ // add 8 to source
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+ // divide source by 16
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+ // source - dest
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+ // calculate sum
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+ // calculate sse
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+ src_pack = src_reg;
+ dst+= dst_stride;
+ }
+ }
+ }
+ // sum < 0
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);
+ // save the next 8 bytes of each lane of sse
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8);
+ // merge the result of sum < 0 with sum to add sign to the next 16 bits
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);
+ // add each 8 bytes from every lane of sse and sum
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);
+
+ // save the next 4 bytes of each lane sse
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4);
+ // save the next 8 bytes of each lane of sum
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8);
+
+ // add the first 4 bytes to the next 4 bytes sse
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+ // add the first 8 bytes to the next 8 bytes
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+ // extract the low lane and the high lane and add the results
+ *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1));
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4);
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+ return sum;
+}
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index c9b90d5..02007a3 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -42,6 +42,18 @@
int *Sum
);
+// Forward declaration of the AVX2 sub-pixel variance helper; it is defined
+// outside this file. NOTE(review): judging by the callers below, it handles a
+// 32-pixel-wide strip of `height` rows, stores the squared-error total in
+// *sse and returns the signed error sum -- confirm against the definition.
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height, unsigned int *sse);
+
static void variance_avx2(const unsigned char *src_ptr, int source_stride,
const unsigned char *ref_ptr, int recon_stride,
int w, int h, unsigned int *sse, int *sum,
@@ -155,3 +167,43 @@
*sse = var;
return (var - (((int64_t)avg * avg) >> 11));
}
+
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  // Two helper passes of 64 rows each: columns 0..31 first, then 32..63.
+  unsigned int sse_left, sse_right;
+  const int sum_left =
+      vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, y_offset,
+                                      dst, dst_stride, 64, &sse_left);
+  const int sum_right =
+      vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
+                                      dst + 32, dst_stride, 64, &sse_right);
+  // Merge the per-half signed sums and squared-error totals.
+  const int sum_total = sum_left + sum_right;
+  const unsigned int sse_total = sse_left + sse_right;
+
+  *sse_ptr = sse_total;
+  // variance = sse - sum^2 / 4096; the shift by 12 divides by 64 * 64.
+  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
+}
+
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  unsigned int sse_total;
+  // One 32-wide helper pass covers the block; variance = sse - sum^2 / 1024.
+  const int sum_total =
+      vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, y_offset,
+                                      dst, dst_stride, 32, &sse_total);
+  *sse_ptr = sse_total;
+  return sse_total - (((int64_t)sum_total * sum_total) >> 10);
+}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 48d6a7c..a448b3c 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -80,6 +80,7 @@
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 8072f78..fbdad74 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -365,7 +365,7 @@
memcpy(oxcf->ts_rate_decimator, cfg.ts_rate_decimator,
sizeof(cfg.ts_rate_decimator));
} else if (oxcf->ts_number_layers == 1) {
- oxcf->ts_target_bitrate[0] = oxcf->target_bandwidth;
+ oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth;
oxcf->ts_rate_decimator[0] = 1;
}
@@ -639,7 +639,7 @@
*x++ = marker;
for (i = 0; i < ctx->pending_frame_count; i++) {
- int this_sz = ctx->pending_frame_sizes[i];
+ unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
for (j = 0; j <= mag; j++) {
*x++ = this_sz & 0xff;
@@ -1049,7 +1049,7 @@
return VPX_CODEC_INVALID_PARAM;
}
if (cpi->svc.spatial_layer_id < 0 ||
- cpi->svc.spatial_layer_id >= ctx->cfg.ss_number_layers) {
+ cpi->svc.spatial_layer_id >= (int)ctx->cfg.ss_number_layers) {
return VPX_CODEC_INVALID_PARAM;
}
return VPX_CODEC_OK;
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 1941fc0..76cbebf 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -214,7 +214,7 @@
? sizeof(vp9_stream_info_t)
: sizeof(vpx_codec_stream_info_t);
memcpy(si, &ctx->si, sz);
- si->sz = sz;
+ si->sz = (unsigned int)sz;
return VPX_CODEC_OK;
}
@@ -462,7 +462,7 @@
while (data_start < data_end && *data_start == 0)
data_start++;
- data_sz = data_end - data_start;
+ data_sz = (unsigned int)(data_end - data_start);
} while (data_start < data_end);
return res;
}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c0d973b..27dd6f6 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -86,6 +86,7 @@
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index d0ac1af..f7dde62 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -297,9 +297,16 @@
int alt_fb_idx; /**< alt reference frame frame buffer index */
} vpx_svc_parameters_t;
+/*!\brief vp9 svc layer parameters
+ *
+ * This defines the spatial and temporal layer id numbers for svc encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and
+ * temporal layer id for the current frame.
+ *
+ */
typedef struct vpx_svc_layer_id {
- int spatial_layer_id;
- int temporal_layer_id;
+ int spatial_layer_id; /**< Spatial layer id number. */
+ int temporal_layer_id; /**< Temporal layer id number. */
} vpx_svc_layer_id_t;
/*!\brief VP8 encoder control function parameter type
diff --git a/vpxdec.c b/vpxdec.c
index d8157d0..660f613 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -791,7 +791,8 @@
vpx_usec_timer_start(&timer);
- if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
+ if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer,
+ NULL, 0)) {
const char *detail = vpx_codec_error_detail(&decoder);
warn("Failed to decode frame %d: %s",
frame_in, vpx_codec_error(&decoder));
@@ -873,7 +874,7 @@
vpx_input_ctx.height,
&vpx_input_ctx.framerate, img->fmt);
if (do_md5) {
- MD5Update(&md5_ctx, (md5byte *)buf, len);
+ MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
} else {
fputs(buf, outfile);
}
@@ -882,7 +883,7 @@
// Y4M frame header
len = y4m_write_frame_header(buf, sizeof(buf));
if (do_md5) {
- MD5Update(&md5_ctx, (md5byte *)buf, len);
+ MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
} else {
fputs(buf, outfile);
}