Speed up dr_prediction_z2_HxW_avx2
Profiled with the following command:
aomenc park_joy.yuv -w 1920 -h 1080 --i420 -b 8 --cpu-used=3 --cq-level=32 --limit=50 -p 1 -o parkjoy.ivf
before the patch:
1.66% 1.65% aomenc aomenc [.] av1_dr_prediction_z2_avx2
|
--1.65%--build_intra_predictors
av1_dr_prediction_z2_avx2
with the patch:
0.59% 0.59% aomenc aomenc [.] av1_dr_prediction_z2_avx2
|
--0.58%--build_intra_predictors
av1_dr_prediction_z2_avx2
Change-Id: I1c486c5f16b598e7cfc4fa67d1efd6577bdfb793
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 357f69e..1a07e24 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -3569,7 +3569,18 @@
{ 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
{ 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
};
-
+/* clang-format off */
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+ { -1, 0, 0, 0, 0, 0, 0, 0},
+ { -1, -1, 0, 0, 0, 0, 0, 0},
+ { -1, -1, -1, 0, 0, 0, 0, 0},
+ { -1, -1, -1, -1, 0, 0, 0, 0},
+ { -1, -1, -1, -1, -1, 0, 0, 0},
+ { -1, -1, -1, -1, -1, -1, 0, 0},
+ { -1, -1, -1, -1, -1, -1, -1, 0},
+ { -1, -1, -1, -1, -1, -1, -1, -1},
+};
+/* clang-format on */
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
int dx) {
@@ -4197,27 +4208,51 @@
base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
- base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
- _mm256_store_si256((__m256i *)base_y_c, base_y_c256); /**/
- a0_y = _mm256_setr_epi16(
- left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
- left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
- left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
- left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
- left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
- left[base_y_c[15]]);
- base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
- _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+ base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
+ int16_t min_y = _mm256_extract_epi16(base_y_c256, 15);
+ int16_t max_y = _mm256_extract_epi16(base_y_c256, 0);
+ int16_t offset_diff = max_y - min_y;
- a1_y = _mm256_setr_epi16(
- left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
- left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
- left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
- left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
- left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
- left[base_y_c[15]]);
+ if (offset_diff < 16) {
+ __m256i min_y256 = _mm256_set1_epi16(min_y);
+ __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
+ __m128i base_y_offset128 =
+ _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
+ _mm256_extracti128_si256(base_y_offset, 1));
+
+ __m128i a0_y128 = _mm_maskload_epi32(
+ (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ __m128i a1_y128 =
+ _mm_maskload_epi32((int *)(left + min_y + 1),
+ *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
+ a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
+ a0_y = _mm256_cvtepu8_epi16(a0_y128);
+ a1_y = _mm256_cvtepu8_epi16(a1_y128);
+ } else {
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ }
shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 128baa2..16ace15 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1377,7 +1377,10 @@
int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
if (use_filter_intra) need_bottom = 0;
if (is_dr_mode) need_bottom = p_angle > 180;
- const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+  // The avx2 dr_prediction_z2 may read at most 3 extra bytes, because the
+  // avx2 mask load operates with dword granularity. So we initialize 3
+  // extra bytes to silence a valgrind complaint.
+ const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
i = 0;
if (n_left_px > 0) {
for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];