Speed up Wiener filter search with aligned loads/stores
Profiling the encoder at speed 4 showed:
3.15%--acc_stat_avx2 (inlined)
|
|--1.44%--yy_loadu_256 (inlined)
| _mm256_loadu_si256 (inlined)
|
--1.18%--yy_storeu_256 (inlined)
_mm256_storeu_si256 (inlined)
Forced the buffer to be 32-byte aligned so that aligned loads/stores can be used. Now, we have:
1.39%--acc_stat_avx2 (inlined)
|
--0.58%--yy_store_256 (inlined)
_mm256_store_si256 (inlined)
The speed test showed ~1.3% speedup at speed 4.
Change-Id: Ib7e4ad9b1ff0c602a2cd8bf61a6ffd5f3e26152d
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index d00fca0..5d4097c 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -22,9 +22,9 @@
const __m128i *shuffle, const __m256i *kl) {
const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
- const __m256i dst0 = yy_loadu_256(dst);
+ const __m256i dst0 = yy_load_256(dst);
const __m256i r0 = _mm256_add_epi32(dst0, d0);
- yy_storeu_256(dst, r0);
+ yy_store_256(dst, r0);
}
static INLINE void acc_stat_win7_one_line_avx2(
@@ -74,7 +74,9 @@
int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+
+ DECLARE_ALIGNED(32, int32_t,
+ H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
int32_t sumX = 0;
@@ -423,7 +425,9 @@
int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
- int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ DECLARE_ALIGNED(
+ 32, int32_t,
+ H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
int32_t sumX = 0;