Optimize avg_8x8 sse2 and avx2 intrinsics
Microbenchmark results (speedup relative to the previous implementation):
+--------------+-----------+-------+-------+
|   Function   | Microarch | AVX2  | SSE2  |
+--------------+-----------+-------+-------+
|              | Zen 2     | x1.22 | x2.66 |
| avg_8x8_quad +-----------+-------+-------+
|              | Skylake   | x1.38 | x2.89 |
+--------------+-----------+-------+-------+
|              | Zen 2     | N/A   | x1.03 |
| avg_8x8      +-----------+-------+-------+
|              | Skylake   | N/A   | x1.03 |
+--------------+-----------+-------+-------+
|              | Zen 2     | N/A   | x1.00 |
| avg_4x4      +-----------+-------+-------+
|              | Skylake   | N/A   | x1.03 |
+--------------+-----------+-------+-------+
Also improve test coverage by testing different offsets.
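The main trick is to sum rows with psadbw (_mm_sad_epu8 / _mm256_sad_epu8)
against a zero vector, which adds eight unsigned bytes per 64-bit lane in a
single instruction, instead of widening each row to 16 bits and chaining
adds. A minimal standalone sketch of the idea (illustrative only; sum16_u8
is a hypothetical helper, not part of this change):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    // Sum 16 unsigned bytes: psadbw against zero yields two partial sums,
    // one per 64-bit lane; fold the high lane into the low one.
    static unsigned int sum16_u8(const uint8_t *s) {
      const __m128i zero = _mm_setzero_si128();
      __m128i sad = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)s), zero);
      sad = _mm_add_epi64(sad, _mm_srli_si128(sad, 8));
      return (unsigned int)_mm_cvtsi128_si32(sad);
    }

The per-lane sums stay small (at most 8 * 8 * 255 = 16320 for an 8x8
block), so partial results can be combined with plain 16-bit adds without
overflow.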
Change-Id: I290385a19d3ac2cc470a97b4c335e5b2abb9b966
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6f374ee..6f4436f 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -505,52 +505,42 @@
}
}
-static INLINE __m256i calc_avg_8x8_dual_avx2(const uint8_t *s, int p) {
- const __m256i s0 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s)));
- const __m256i s1 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + p)));
- const __m256i s2 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 2 * p)));
- const __m256i s3 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 3 * p)));
- const __m256i sum0 =
- _mm256_add_epi16(_mm256_add_epi16(s0, s1), _mm256_add_epi16(s2, s3));
- const __m256i s4 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 4 * p)));
- const __m256i s5 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 5 * p)));
- const __m256i s6 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 6 * p)));
- const __m256i s7 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 7 * p)));
- const __m256i sum1 =
- _mm256_add_epi16(_mm256_add_epi16(s4, s5), _mm256_add_epi16(s6, s7));
-
- // The result of two 8x8 sub-blocks in 16x16 block.
- return _mm256_add_epi16(sum0, sum1);
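+// Load two unaligned 128-bit values into one 256-bit register: lo goes to
+// the low half, hi to the high half.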
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
}
void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
int *avg) {
- // Process 1st and 2nd 8x8 sub-blocks in a 16x16 block.
- const uint8_t *s_tmp = s + y16_idx * p + x16_idx;
- __m256i result_0 = calc_avg_8x8_dual_avx2(s_tmp, p);
+ const uint8_t *s_y0 = s + y16_idx * p + x16_idx;
+ const uint8_t *s_y1 = s_y0 + 8 * p;
+ __m256i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm256_setzero_si256();
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
+ sum0 = _mm256_add_epi16(s0, s1);
+ sum1 = _mm256_add_epi16(s2, s3);
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
+ sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1));
+ sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3));
+ sum0 = _mm256_add_epi16(sum0, sum1);
- // Process 3rd and 4th 8x8 sub-blocks in a 16x16 block.
- s_tmp = s + ((y16_idx + 8) * p) + x16_idx;
- __m256i result_1 = calc_avg_8x8_dual_avx2(s_tmp, p);
-
- const __m256i constant_32 = _mm256_set1_epi16(32);
- result_0 = _mm256_hadd_epi16(result_0, result_1);
- result_1 = _mm256_adds_epu16(result_0, _mm256_srli_si256(result_0, 4));
- result_0 = _mm256_adds_epu16(result_1, _mm256_srli_si256(result_1, 2));
- result_0 = _mm256_adds_epu16(result_0, constant_32);
- result_0 = _mm256_srli_epi16(result_0, 6);
- avg[0] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 0);
- avg[1] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 0);
- avg[2] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 4);
- avg[3] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 4);
+ // (avg + 32) >> 6
+ __m256i rounding = _mm256_set1_epi32(32);
+ sum0 = _mm256_add_epi32(sum0, rounding);
+ sum0 = _mm256_srli_epi32(sum0, 6);
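+ // The four 64-bit SAD lanes map to the four 8x8 sub-blocks: the low 128
+ // bits hold the top pair, the high 128 bits the bottom pair.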
+ __m128i lo = _mm256_castsi256_si128(sum0);
+ __m128i hi = _mm256_extracti128_si256(sum0, 1);
+ avg[0] = _mm_cvtsi128_si32(lo);
+ avg[1] = _mm_extract_epi32(lo, 2);
+ avg[2] = _mm_cvtsi128_si32(hi);
+ avg[3] = _mm_extract_epi32(hi, 2);
}
void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 9657ecc..c48da15 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -14,6 +14,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
#include "aom_ports/mem.h"
void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
@@ -95,39 +96,61 @@
}
unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
- __m128i s0, s1, u0;
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
unsigned int avg = 0;
u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
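+  // Pack rows i and i + 1 into one register, then psadbw against zero:
+  // each 64-bit lane holds the byte sum of one row.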
+ s0 = loadh_epi64((const __m128i *)(s + p),
+ _mm_loadl_epi64((const __m128i *)(s)));
+ s1 = loadh_epi64((const __m128i *)(s + 3 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
+ s2 = loadh_epi64((const __m128i *)(s + 5 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
+ s3 = loadh_epi64((const __m128i *)(s + 7 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s2 = _mm_sad_epu8(s2, u0);
+ s3 = _mm_sad_epu8(s3, u0);
- s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
- avg = _mm_extract_epi16(s0, 0);
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ sum0 = _mm_add_epi16(sum0, sum1);
+ sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
+ avg = _mm_cvtsi128_si32(sum0);
return (avg + 32) >> 6;
}
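+
+// Averages two horizontally adjacent 8x8 sub-blocks in one pass: each
+// 16-byte row SAD produces one partial sum per sub-block (one per 64-bit
+// lane).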
+static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
+ sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
+ sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
+ sum0 = _mm_add_epi16(sum0, sum1);
+
+ // (avg + 32) >> 6
+ __m128i rounding = _mm_set1_epi32(32);
+ sum0 = _mm_add_epi32(sum0, rounding);
+ sum0 = _mm_srli_epi32(sum0, 6);
+ avg[0] = _mm_cvtsi128_si32(sum0);
+ avg[1] = _mm_extract_epi16(sum0, 4);
+}
+
void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
int *avg) {
- for (int k = 0; k < 4; k++) {
- const int x8_idx = x16_idx + ((k & 1) << 3);
- const int y8_idx = y16_idx + ((k >> 1) << 3);
- const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
- avg[k] = aom_avg_8x8_sse2(s_tmp, p);
+ const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
+ for (int k = 0; k < 2; k++) {
+ calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
+ s_ptr += 8 * p;
}
}
@@ -135,17 +158,14 @@
__m128i s0, s1, u0;
unsigned int avg = 0;
u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
-
- s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
- avg = _mm_extract_epi16(s0, 0);
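+  // Pack two 4-pixel rows into the low 8 bytes of each register, so one
+  // psadbw sums two rows at once.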
+ s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
+ _mm_cvtsi32_si128(*(const int *)(s + p)));
+ s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
+ _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s0 = _mm_add_epi16(s0, s1);
+ avg = _mm_cvtsi128_si32(s0);
return (avg + 8) >> 4;
}
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 8922f34..4e86f06 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,9 @@
protected:
// Handle blocks up to 4 blocks 64x64 with stride up to 128
static const int kDataAlignment = 16;
- static const int kDataBlockSize = 128 * 128;
+ static const int kDataBlockWidth = 128;
+ static const int kDataBlockHeight = 128;
+ static const int kDataBlockSize = kDataBlockWidth * kDataBlockHeight;
virtual void SetUp() {
const testing::TestInfo *const test_info =
@@ -236,13 +238,11 @@
using AverageTestBase<Pixel>::FillConstant;
using AverageTestBase<Pixel>::FillRandom;
- void CheckAverages(int iterations) {
+ void CheckAveragesAt(int iterations, int x16_idx, int y16_idx) {
ASSERT_EQ(sizeof(Pixel), 1u);
const int block_size = GET_PARAM(4);
(void)block_size;
int expected[4] = { 0 };
- int x16_idx = 0;
- int y16_idx = 0;
// The reference frame, but not the source frame, may be unaligned for
// certain types of searches.
@@ -285,19 +285,25 @@
}
}
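+  // Exercise the quad averager at a range of (x16_idx, y16_idx) offsets,
+  // not just the aligned top-left position.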
+ void CheckAverages() {
+ for (int x16_idx = 0; x16_idx < this->kDataBlockWidth / 8; x16_idx += 2)
+ for (int y16_idx = 0; y16_idx < this->kDataBlockHeight / 8; y16_idx += 2)
+ CheckAveragesAt(1, x16_idx, y16_idx);
+ }
+
void TestConstantValue(Pixel value) {
FillConstant(value);
- CheckAverages(1);
+ CheckAverages();
}
void TestRandom() {
FillRandom();
- CheckAverages(1);
+ CheckAverages();
}
void TestSpeed() {
FillRandom();
- CheckAverages(1000000);
+ CheckAveragesAt(1000000, 0, 0);
}
int64_t ref_elapsed_time_ = 0;