SSE2 implementation of 4-tap filter for widths 8, 4

Added SSE2 implementation of aom_filter_block1d8_h4_sse2,
aom_filter_block1d8_v4_sse2, aom_filter_block1d4_h4_sse2
and aom_filter_block1d4_v4_sse2. Approximately 44%
improvement is seen with respect to the 8-tap filter at the
unit-test level.

Change-Id: I7f68e136207983e99a7b7e4e49d07b09623afeff
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index d19fb60..2453764 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -24,10 +24,10 @@
 filter8_1dfunction aom_filter_block1d16_v4_sse2;
 filter8_1dfunction aom_filter_block1d16_h4_sse2;
 
-#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
-#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
-#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
-#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
+filter8_1dfunction aom_filter_block1d8_h4_sse2;
+filter8_1dfunction aom_filter_block1d8_v4_sse2;
+filter8_1dfunction aom_filter_block1d4_h4_sse2;
+filter8_1dfunction aom_filter_block1d4_v4_sse2;
 
 filter8_1dfunction aom_filter_block1d16_v2_sse2;
 filter8_1dfunction aom_filter_block1d16_h2_sse2;
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
index 2215cd7..d307312 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
@@ -258,3 +258,317 @@
     srcReg4 = srcReg6;
   }
 }
+
+// Horizontal 4-tap convolution of an 8-pixel-wide block (SSE2).
+// Only the middle four coefficients (taps 2..5) of the 8-tap kernel in
+// |filter| are used, so each output pixel x reads src[x - 1 .. x + 2];
+// the src_ptr -= 3 below undoes the caller's 8-tap source offset.
+void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
+                                 ptrdiff_t src_pixels_per_line,
+                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
+                                 uint32_t output_height,
+                                 const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+  __m128i srcReg32b1;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // Halve the coefficients; this is compensated for below by rounding with
+  // 32 and shifting right by 6 (instead of 64 and 7).
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // Even output pixels: madd pairs adjacent widened source pixels with
+    // (coeff2, coeff3) and (coeff4, coeff5), then sum the partial products.
+    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+    __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
+    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+    // Odd output pixels: the same computation shifted one source byte along.
+    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+    d1 = _mm_madd_epi16(ss_3, secondFilters);
+    d2 = _mm_madd_epi16(ss_5, thirdFilters);
+    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+    // Re-interleave the even/odd 32-bit sums back into pixel order and
+    // narrow to 16 bits with saturation.
+    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    src_ptr += src_pixels_per_line;
+
+    // Store the 8 filtered pixels of this row.
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+// Vertical 4-tap convolution of an 8-pixel-wide block (SSE2).
+// Applies taps 2..5 of the 8-tap kernel in |filter|, so output row r is
+// built from source rows src_ptr + {2, 3, 4, 5} * src_pitch (the offsets
+// below account for the caller's 8-tap source positioning). Two output
+// rows are produced per loop iteration.
+// NOTE(review): rows are processed in pairs, so output_height appears to
+// be assumed even — confirm against callers.
+void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
+                                 uint32_t output_height,
+                                 const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23_lo, srcReg34_lo;
+  __m128i srcReg45_lo, srcReg56_lo;
+  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+  __m128i resReg23_45_lo, resReg34_56_lo;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  __m128i tmp_0, tmp_1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // Halve the coefficients; compensated for below by rounding with 32 and
+  // shifting right by 6 (instead of 64 and 7).
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // Prime the pipeline: interleave rows 2/3 and 3/4 byte-wise and widen the
+  // first 8 interleaved pixel pairs to 16 bits for _mm_madd_epi16.
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+
+    // Rows 2/3 and 3/4 against coeffs (2, 3); narrow the 32-bit sums to 16
+    // bits with saturation.
+    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+    // Rows 4/5 and 5/6 against coeffs (4, 5).
+    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+    // add and saturate the results together
+    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+    // shift by 6 bit each 16 bit
+    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
+    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
+
+    src_ptr += src_stride;
+
+    // Store the two finished 8-pixel output rows.
+    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    resReg23_lo_1 = resReg45_lo_1;
+    resReg23_lo_2 = resReg45_lo_2;
+    resReg34_lo_1 = resReg56_lo_1;
+    resReg34_lo_2 = resReg56_lo_2;
+    srcReg4 = srcReg6;
+  }
+}
+
+// Horizontal 4-tap convolution of a 4-pixel-wide block (SSE2).
+// Only the middle four coefficients (taps 2..5) of the 8-tap kernel in
+// |filter| are used, so each output pixel x reads src[x - 1 .. x + 2];
+// the src_ptr -= 3 below undoes the caller's 8-tap source offset.
+void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+                                 ptrdiff_t src_pixels_per_line,
+                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
+                                 uint32_t output_height,
+                                 const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1;
+  __m128i srcReg32b1;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // Halve the coefficients; compensated for below by rounding with 32 and
+  // shifting right by 6 (instead of 64 and 7).
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // Four shifted, widened copies of the source row, one per filter tap
+    // position.
+    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+
+    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+
+    // Interleave at 32-bit granularity so each madd lane holds the pixel
+    // pair for one output position.
+    __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
+    __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
+
+    // madd with coeffs (2, 3) and (4, 5), then sum the partial products to
+    // get the four 32-bit output sums in pixel order.
+    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+    __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
+    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+    srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    src_ptr += src_pixels_per_line;
+
+    // Store the 4 filtered pixels of this row as one 32-bit write.
+    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+// Vertical 4-tap convolution of a 4-pixel-wide block (SSE2).
+// Applies taps 2..5 of the 8-tap kernel in |filter|, so output row r is
+// built from source rows src_ptr + {2, 3, 4, 5} * src_pitch (offsets below
+// account for the caller's 8-tap source positioning). Two output rows are
+// computed per loop iteration and held in one register.
+// NOTE(review): rows are processed in pairs, so output_height appears to
+// be assumed even — confirm against callers.
+void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
+                                 uint32_t output_height,
+                                 const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+  __m128i resReg23_34, resReg45_56;
+  __m128i resReg23_34_45_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  __m128i tmp_0, tmp_1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // Halve the coefficients; compensated for below by rounding with 32 and
+  // shifting right by 6 (instead of 64 and 7).
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // Prime the pipeline: interleave rows 2/3 and 3/4 byte-wise and widen the
+  // interleaved pixel pairs to 16 bits for _mm_madd_epi16.
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+  __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
+
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+  __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    // Rows 2/3 and 3/4 against coeffs (2, 3); the pack leaves output row 0's
+    // partial sums in the low four words and output row 1's in the high four.
+    tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
+    tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
+    resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
+
+    __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
+    __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
+
+    // Rows 4/5 and 5/6 against coeffs (4, 5), laid out the same way.
+    tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
+    tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
+    resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
+
+    // add and saturate the results together
+    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
+
+    // shift by 6 bit each 16 bit
+    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
+    resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    resReg23_34_45_56 =
+        _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
+
+    src_ptr += src_stride;
+
+    // Bytes 0-3 hold output row 0, bytes 4-7 hold output row 1.
+    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
+    *((uint32_t *)(output_ptr + out_pitch)) =
+        _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    resReg23 = resReg45;
+    resReg34 = resReg56;
+    srcReg4 = srcReg6;
+  }
+}
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index e71f211..33e3d55 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -191,7 +191,8 @@
   const int h = block_size_high[bsize];
   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
   int subpel_search;
-  for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+  for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+       ++subpel_search) {
     // loop through subx and suby
     for (int sub = 0; sub < 8 * 8; ++sub) {
       int subx = sub & 0x7;
@@ -231,7 +232,7 @@
   const int num_loops = 1000000000 / (w + h);
   comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl };
   double elapsed_time[2] = { 0 };
-  int subpel_search = 2;  // set to 1 to test 4-tap filter.
+  int subpel_search = USE_8_TAPS;  // set to USE_4_TAPS to test 4-tap filter.
   for (int i = 0; i < 2; ++i) {
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);