rtc: Optimize AVX2 intrinsic of aom_int_pro_col()
This CL optimizes aom_int_pro_col_avx2() by fully utilizing the
registers after the SAD operation. At the module level, performance
improves by 30% on average across the different block sizes.
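
For reference, aom_int_pro_col() produces one normalized pixel sum
per row of the block. A minimal scalar sketch of that computation
(illustrative only, not the exact C reference implementation):

  #include <stdint.h>

  // Illustrative sketch of what aom_int_pro_col() computes; the
  // SIMD code below replaces the inner loop with SAD against zero.
  static void int_pro_col_sketch(int16_t *vbuf, const uint8_t *ref,
                                 int ref_stride, int width, int height,
                                 int norm_factor) {
    for (int ht = 0; ht < height; ++ht) {
      int sum = 0;
      // Sum all pixels of this row.
      for (int i = 0; i < width; ++i) sum += ref[i];
      vbuf[ht] = (int16_t)(sum >> norm_factor);  // Normalize by shifting.
      ref += ref_stride;
    }
  }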
Encoder speed-up for the RT preset:

                    Instruction Count
  cpu   Testset     Reduction (%)
   7    rtc            0.403
   7    rtc_derf       0.516
   8    rtc            0.448
   8    rtc_derf       0.289
Change-Id: Ia366a173b6d0e3834c8e0b2bb62a4523ef9c85f4
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6f4436f..c85d8c5 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -588,75 +588,183 @@
}
}
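+// Load four consecutive rows of 32 pixels each from the source buffer.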
+static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src,
+ const int stride) {
+ src[0] = _mm256_loadu_si256((const __m256i *)ref1);
+ src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride)));
+ src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride)));
+}
+
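+// Combine the per-row SAD partials r0..r3 (one 16-bit sum in the low 16 bits
+// of each 64-bit element) into four adjacent 16-bit row totals, normalize
+// them by norm_factor and store them at vbuf. Expects r0..r3, vbuf and
+// norm_factor to be in scope at the point of use.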
+#define CALC_TOT_SAD_AND_STORE \
+ /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \
+ const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \
+ /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \
+ const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \
+ /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \
+ const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \
+ \
+ const __m128i results0 = _mm_add_epi16( \
+ _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \
+ const __m128i results1 = \
+ _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \
+ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor));
+
+static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride,
+ const int height,
+ int norm_factor) {
+ const __m256i zero = _mm256_setzero_si256();
+ int ht = 0;
+  // After the SAD operation, each 64-bit element holds its sum in the low
+  // 16 bits and the remaining bits are zero. Hence, process 8 rows at a
+  // time to utilize the registers more efficiently.
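+  // Rows i and i + 4 share one 256-bit register (low and high 128-bit
+  // lanes), so a single SAD yields two 16-bit sums for each of the two rows.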
+ do {
+ __m256i src_00 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ src_00 = _mm256_inserti128_si256(
+ src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1);
+ __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1)));
+ src_01 = _mm256_inserti128_si256(
+ src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1);
+ __m256i src_10 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2)));
+ src_10 = _mm256_inserti128_si256(
+ src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1);
+ __m256i src_11 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3)));
+ src_11 = _mm256_inserti128_si256(
+ src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1);
+
+ // s00 x x x s01 x x x | s40 x x x s41 x x x
+ const __m256i s0 = _mm256_sad_epu8(src_00, zero);
+ // s10 x x x s11 x x x | s50 x x x s51 x x x
+ const __m256i s1 = _mm256_sad_epu8(src_01, zero);
+ // s20 x x x s21 x x x | s60 x x x s61 x x x
+ const __m256i s2 = _mm256_sad_epu8(src_10, zero);
+ // s30 x x x s31 x x x | s70 x x x s71 x x x
+ const __m256i s3 = _mm256_sad_epu8(src_11, zero);
+
+ // s00 s10 x x x x x x | s40 s50 x x x x x x
+ const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1);
+ // s01 s11 x x x x x x | s41 s51 x x x x x x
+ const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1);
+ // s20 s30 x x x x x x | s60 s70 x x x x x x
+ const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3);
+ // s21 s31 x x x x x x | s61 s71 x x x x x x
+ const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3);
+
+ // s0 s1 x x x x x x | s4 s5 x x x x x x
+ const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi);
+ // s2 s3 x x x x x x | s6 s7 x x x x x x
+ const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi);
+
+    // s0 s1 s2 s3 s4 s5 s6 s7
+ const __m128i results = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08));
+ _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor));
+ vbuf += 8;
+ ref += (ref_stride << 3);
+ ht += 8;
+ } while (ht < height);
+}
+
void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
const int ref_stride, const int width,
const int height, int norm_factor) {
- // SIMD implementation assumes width to be multiple of 16. For any odd width,
- // SIMD support needs to be added.
assert(width % 16 == 0);
-
if (width == 128) {
const __m256i zero = _mm256_setzero_si256();
- for (int ht = 0; ht < height; ++ht) {
- const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
- const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(ref + 64));
- const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(ref + 96));
- const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
- const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
- const __m256i s2 = _mm256_sad_epu8(src_line2, zero);
- const __m256i s3 = _mm256_sad_epu8(src_line3, zero);
- const __m256i result0_256bit = _mm256_add_epi16(s0, s1);
- const __m256i result1_256bit = _mm256_add_epi16(s2, s3);
- const __m256i result_256bit =
- _mm256_add_epi16(result0_256bit, result1_256bit);
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[16];
+ // Load source data.
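+      // src[4 * grp + row] holds row 'row' of the 32-pixel column group
+      // 'grp'.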
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+ load_from_src_buf(ref + 64, &src[8], ref_stride);
+ load_from_src_buf(ref + 96, &src[12], ref_stride);
- const __m128i result =
- _mm_add_epi16(_mm256_castsi256_si128(result_256bit),
- _mm256_extractf128_si256(result_256bit, 1));
- __m128i result1 = _mm_add_epi16(result, _mm_srli_si128(result, 8));
- vbuf[ht] = _mm_cvtsi128_si32(result1) >> norm_factor;
- ref += ref_stride;
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero),
+ _mm256_sad_epu8(src[4], zero));
+ const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero),
+ _mm256_sad_epu8(src[12], zero));
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero),
+ _mm256_sad_epu8(src[5], zero));
+ const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero),
+ _mm256_sad_epu8(src[13], zero));
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero),
+ _mm256_sad_epu8(src[6], zero));
+ const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero),
+ _mm256_sad_epu8(src[14], zero));
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero),
+ _mm256_sad_epu8(src[7], zero));
+ const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero),
+ _mm256_sad_epu8(src[15], zero));
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
}
} else if (width == 64) {
const __m256i zero = _mm256_setzero_si256();
- for (int ht = 0; ht < height; ++ht) {
- const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
- const __m256i s1 = _mm256_sad_epu8(src_line0, zero);
- const __m256i s2 = _mm256_sad_epu8(src_line1, zero);
- const __m256i result_256bit = _mm256_add_epi16(s1, s2);
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[8];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
- const __m128i result =
- _mm_add_epi16(_mm256_castsi256_si128(result_256bit),
- _mm256_extractf128_si256(result_256bit, 1));
- __m128i result1 = _mm_add_epi16(result, _mm_srli_si128(result, 8));
- vbuf[ht] = _mm_cvtsi128_si32(result1) >> norm_factor;
- ref += ref_stride;
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_sad_epu8(src[0], zero);
+ const __m256i s1 = _mm256_sad_epu8(src[4], zero);
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_sad_epu8(src[1], zero);
+ const __m256i s3 = _mm256_sad_epu8(src[5], zero);
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_sad_epu8(src[2], zero);
+ const __m256i s5 = _mm256_sad_epu8(src[6], zero);
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_sad_epu8(src[3], zero);
+ const __m256i s7 = _mm256_sad_epu8(src[7], zero);
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
}
} else if (width == 32) {
assert(height % 2 == 0);
const __m256i zero = _mm256_setzero_si256();
- for (int ht = 0; ht < height; ht += 2) {
- const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i src_line1 =
- _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
- const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
- const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[4];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
- __m128i result0 = _mm_add_epi16(_mm256_castsi256_si128(s0),
- _mm256_extractf128_si256(s0, 1));
- __m128i result1 = _mm_add_epi16(_mm256_castsi256_si128(s1),
- _mm256_extractf128_si256(s1, 1));
- __m128i result2 = _mm_add_epi16(result0, _mm_srli_si128(result0, 8));
- __m128i result3 = _mm_add_epi16(result1, _mm_srli_si128(result1, 8));
+      // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+      const __m256i r0 = _mm256_sad_epu8(src[0], zero);
+      // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+      const __m256i r1 = _mm256_sad_epu8(src[1], zero);
+      // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+      const __m256i r2 = _mm256_sad_epu8(src[2], zero);
+      // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+      const __m256i r3 = _mm256_sad_epu8(src[3], zero);
- vbuf[ht] = _mm_cvtsi128_si32(result2) >> norm_factor;
- vbuf[ht + 1] = _mm_cvtsi128_si32(result3) >> norm_factor;
- ref += (2 * ref_stride);
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
}
} else if (width == 16) {
- aom_int_pro_col_sse2(vbuf, ref, ref_stride, width, height, norm_factor);
+ aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor);
}
}