Improvement in av1_calc_proj_params_avx2 function
Implemented a common approach for 32 and 64 bit build to store
the values of H and C matrices.
Change-Id: I580a03dfd2dfd188c34ce4f001f4116da77eada7
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 627ee6c..f8703a2 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -640,8 +640,8 @@
const uint8_t *src = src8;
const uint8_t *dat = dat8;
__m256i h00, h01, h11, c0, c1;
- h00 = _mm256_setzero_si256();
- h01 = h11 = c0 = c1 = h00;
+ const __m256i zero = _mm256_setzero_si256();
+ h01 = h11 = c0 = c1 = h00 = zero;
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; j += 8) {
@@ -688,46 +688,36 @@
c1 = _mm256_add_epi64(c1, c1_odd);
}
}
- const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
- _mm256_castsi256_si128(h00));
- const __m128i h00_val =
- _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
- const __m128i h01_128bit = _mm_add_epi64(_mm256_extracti128_si256(h01, 1),
- _mm256_castsi256_si128(h01));
- const __m128i h01_val =
- _mm_add_epi64(h01_128bit, _mm_srli_si128(h01_128bit, 8));
+ __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+ const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+ c_low = _mm256_add_epi64(c_low, c_high);
+ const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+ _mm256_castsi256_si128(c_low));
- const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
- _mm256_castsi256_si128(h11));
- const __m128i h11_val =
- _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+ __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+ const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+ h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+ const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+ _mm256_castsi256_si128(h0x_low));
- const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
- _mm256_castsi256_si128(c0));
- const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+ // Using the symmetric properties of H, calculations of H[1][0] are not
+ // needed.
+ __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+ const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+ h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+ const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+ _mm256_castsi256_si128(h1x_low));
- const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
- _mm256_castsi256_si128(c1));
- const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
-
-#if ARCH_X86_64
- H[0][0] = _mm_extract_epi64(h00_val, 0);
- H[0][1] = _mm_extract_epi64(h01_val, 0);
- H[1][1] = _mm_extract_epi64(h11_val, 0);
- C[0] = _mm_extract_epi64(c0_val, 0);
- C[1] = _mm_extract_epi64(c1_val, 0);
-#else
- xx_storel_64(&H[0][0], h00_val);
- xx_storel_64(&H[0][1], h01_val);
- xx_storel_64(&H[1][1], h11_val);
- xx_storel_64(&C[0], c0_val);
- xx_storel_64(&C[1], c1_val);
-#endif
+ xx_storeu_128(C, c_128bit);
+ xx_storeu_128(H[0], h0x_128bit);
+ xx_storeu_128(H[1], h1x_128bit);
H[0][0] /= size;
H[0][1] /= size;
H[1][1] /= size;
+
+ // Since H is a symmetric matrix
H[1][0] = H[0][1];
C[0] /= size;
C[1] /= size;
@@ -745,8 +735,8 @@
const uint8_t *src = src8;
const uint8_t *dat = dat8;
__m256i h00, c0;
- h00 = _mm256_setzero_si256();
- c0 = h00;
+ const __m256i zero = _mm256_setzero_si256();
+ c0 = h00 = zero;
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; j += 8) {
@@ -782,13 +772,11 @@
_mm256_castsi256_si128(c0));
const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
-#if ARCH_X86_64
- H[0][0] = _mm_extract_epi64(h00_val, 0);
- C[0] = _mm_extract_epi64(c0_val, 0);
-#else
- xx_storel_64(&H[0][0], h00_val);
- xx_storel_64(&C[0], c0_val);
-#endif
+ const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
H[0][0] /= size;
C[0] /= size;
@@ -806,8 +794,8 @@
const uint8_t *src = src8;
const uint8_t *dat = dat8;
__m256i h11, c1;
- h11 = _mm256_setzero_si256();
- c1 = h11;
+ const __m256i zero = _mm256_setzero_si256();
+ c1 = h11 = zero;
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; j += 8) {
@@ -844,13 +832,11 @@
_mm256_castsi256_si128(c1));
const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
-#if ARCH_X86_64
- H[1][1] = _mm_extract_epi64(h11_val, 0);
- C[1] = _mm_extract_epi64(c1_val, 0);
-#else
- xx_storel_64(&H[1][1], h11_val);
- xx_storel_64(&C[1], c1_val);
-#endif
+ const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
H[1][1] /= size;
C[1] /= size;