Speed up wiener filter search in HBD case
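This patch follows the approach of
https://aomedia-review.googlesource.com/c/aom/+/85363
and enforces aligned loads/stores in the HBD Wiener filter search code:
the H accumulation buffers are declared 32-byte aligned, so the unaligned
256-bit loads/stores on them are replaced with their aligned counterparts.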
Change-Id: Ie21ee58a6e1abbe78b3ef25659739112b2bd42c9
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 5d4097c..5571165 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -147,14 +147,14 @@
// Take the lower-half of d0, extend to u64, add it on to dst (H)
const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
// d0l = [a b] [c d] as u64
- const __m256i dst0 = yy_loadu_256(dst);
- yy_storeu_256(dst, _mm256_add_epi64(d0l, dst0));
+ const __m256i dst0 = yy_load_256(dst);
+ yy_store_256(dst, _mm256_add_epi64(d0l, dst0));
// Take the upper-half of d0, extend to u64, add it on to dst (H)
const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
// d0h = [e f] [g h] as u64
- const __m256i dst1 = yy_loadu_256(dst + 4);
- yy_storeu_256(dst + 4, _mm256_add_epi64(d0h, dst1));
+ const __m256i dst1 = yy_load_256(dst + 4);
+ yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
}
static INLINE void acc_stat_highbd_win7_one_line_avx2(
@@ -218,7 +218,7 @@
find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
int32_t sumX = 0;
const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
@@ -318,7 +318,9 @@
find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
- int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ DECLARE_ALIGNED(
+ 32, int64_t,
+ H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
int32_t sumX = 0;
const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;