SSSE3 implementation of 4-tap filter

Added SSSE3 implementation of aom_filter_block1d16_h4_ssse3
and aom_filter_block1d16_v4_ssse3 for block width >= 16.
An improvement of approximately 30% over the 8-tap filter is seen
at the unit-test level.

Change-Id: I5df3ba39463dba3a414bdf04871799cab89115a5
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 325a21b..368fb17 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -45,6 +45,16 @@
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
+DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
+  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
+  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
+  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
+  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
+  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
+  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
+  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};  // four byte-pair shuffle masks, each 16-byte mask stored twice
+
 // These are reused by the avx2 intrinsics.
 filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
 filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
@@ -280,6 +290,187 @@
   }
 }
 
+static void aom_filter_block1d16_h4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32, filt2Reg, filt3Reg;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m128i srcReg32b1, srcReg32b2;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // pack the halved 16-bit taps to signed 8-bit; packing the register with
+  // itself leaves the same 8 bytes in both 64-bit halves.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // broadcast the second 16 bits (third and fourth bytes, taps 2 and 3)
+  // across the 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // broadcast the third 16 bits (fifth and sixth bytes, taps 4 and 5)
+  // across the 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // gather byte pairs (x+2, x+3) and (x+4, x+5) for outputs x = 0..7
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // load 16 bytes starting 8 pixels later, covering outputs 8..15
+    // (its first 8 bytes overlap the previous load)
+    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+    // same byte-pair gather, now for outputs x = 8..15
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // add the rounding offset (32) and arithmetic-shift each 16 bit by 6
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // pack to unsigned 8 bit with saturation: the low 8 bytes hold
+    // outputs 0..7 and the high 8 bytes hold outputs 8..15
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+    src_ptr += src_pixels_per_line;
+
+    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d16_v4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // halve the taps, then pack them to signed 8 bit; packing the register
+  // with itself leaves the same 8 bytes in both 64-bit halves.
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // broadcast the second 16 bits (third and fourth bytes, taps 2 and 3)
+  // across the 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // broadcast the third 16 bits (fifth and sixth bytes, taps 4 and 5)
+  // across the 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // double both strides: each loop iteration produces two output rows
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // interleave rows 3 and 4 bytewise so adjacent bytes form a vertical pair
+  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+    // multiply each interleaved row pair by its tap pair and add (low half)
+    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
+    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
+    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
+    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+    // multiply each interleaved row pair by its tap pair and add (high half)
+
+    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
+    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
+    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
+    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+    // add the rounding offset (32) and arithmetic-shift each 16 bit by 6
+    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+    // pack to unsigned 8 bit with saturation; the _lo results fill the
+    // low 8 bytes and the _hi results the high 8 bytes, so each register
+    // holds one 16-pixel output row
+    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+    src_ptr += src_stride;
+
+    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+    output_ptr += dst_stride;
+
+    // carry the last two row-pair interleaves and row 6 into the next pass
+    srcReg23_lo = srcReg45_lo;
+    srcReg34_lo = srcReg56_lo;
+    srcReg23_hi = srcReg45_hi;
+    srcReg34_hi = srcReg56_hi;
+    srcReg4 = srcReg6;
+  }
+}
+
 filter8_1dfunction aom_filter_block1d16_v8_ssse3;
 filter8_1dfunction aom_filter_block1d16_h8_ssse3;
 filter8_1dfunction aom_filter_block1d8_v8_ssse3;
@@ -287,8 +478,6 @@
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
 
-#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
-#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
 #define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
 #define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
 #define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3