Add blend_a64_mask avx2 code 1.opt blend_a64_mask sse4 code 2.add blend_a64_mask avx2 code speed up about %0.5 without rd change test sequence: BasketballDrill_832x480_50.y4m test command line:./aomenc --cpu-used=1 --psnr -D \ -q --end-usage=vbr --target-bitrate=800 --limit=20 \ BasketballDrill_832x480_50.y4m -otest.webm Change-Id: I3c3d1f87f56d2315e1b0353d8fac6efdb7ac2152
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 8bd8911..e45ea4d 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -548,7 +548,7 @@ add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; -specialize "aom_blend_a64_mask", qw/sse4_1/; +specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c index d64fca8..67fb4d3 100644 --- a/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -20,6 +20,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/blend_sse4.h" #include "aom_dsp/x86/blend_mask_sse4.h" #include "config/aom_dsp_rtcd.h" @@ -399,3 +400,501 @@ } } } + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subx, suby); + } else { + if (subx & suby) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subx) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (suby) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +}
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c index 55af9d8..9d6b4c2 100644 --- a/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -33,19 +33,13 @@ const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -60,19 +54,13 @@ const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_64(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; @@ -86,23 +74,17 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_m0l_b = xx_loadl_64(mask + c); - const __m128i v_m0h_b = xx_loadl_64(mask + c + 8); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); - const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -121,23 +103,20 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -151,22 +130,20 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadu_128(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -181,28 +158,24 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_rl_b = xx_loadu_128(mask + 2 * c); - const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16); - const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); - const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); - const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -221,21 +194,18 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -250,21 +220,16 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -279,26 +244,18 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zero = _mm_setzero_si128(); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_ra_b = xx_loadu_128(mask + c); const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -317,27 +274,24 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -352,27 +306,25 @@ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -389,8 +341,8 @@ const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { @@ -411,14 +363,11 @@ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); }
diff --git a/aom_dsp/x86/blend_a64_vmask_sse4.c b/aom_dsp/x86/blend_a64_vmask_sse4.c index 59506bd..0649102 100644 --- a/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -39,7 +39,7 @@ const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -64,7 +64,7 @@ const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -90,9 +90,9 @@ const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
diff --git a/aom_dsp/x86/blend_sse4.h b/aom_dsp/x86/blend_sse4.h index 4880438..f7be7cf 100644 --- a/aom_dsp/x86/blend_sse4.h +++ b/aom_dsp/x86/blend_sse4.h
@@ -14,20 +14,39 @@ #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; ////////////////////////////////////////////////////////////////////////////// // Common kernels ////////////////////////////////////////////////////////////////////////////// static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); @@ -36,21 +55,49 @@ return v_res_w; } -static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); - const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); - const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); - return v_res_w; + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; } typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h index 39f371f..b4458aa 100644 --- a/aom_dsp/x86/synonyms_avx2.h +++ b/aom_dsp/x86/synonyms_avx2.h
@@ -61,4 +61,8 @@ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } +static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); + return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); +} #endif // AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc index 4d4f070..66ca6fc 100644 --- a/test/blend_a64_mask_test.cc +++ b/test/blend_a64_mask_test.cc
@@ -43,10 +43,16 @@ virtual ~BlendA64MaskTest() {} - virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1) = 0; + virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1, + int run_times) = 0; template <typename Pixel> - void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/) { + void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) { + if (run_times > 1) { + *src0 = src0_; + *src1 = src1_; + return; + } switch (this->rng_(3)) { case 0: // Separate sources *src0 = src0_; @@ -68,19 +74,20 @@ } } - void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/) { + void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/, + int /*run_times*/) { *src0 = src0_; *src1 = src1_; } uint8_t Rand1() { return this->rng_.Rand8() & 1; } - void RunTest() { - w_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1); - h_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1); - - subx_ = Rand1(); - suby_ = Rand1(); + void RunOneTest(int block_size, int subx, int suby, int run_times) { + w_ = block_size_wide[block_size]; + h_ = block_size_high[block_size]; + run_times = run_times > 1 ? run_times / w_ : 1; + subx_ = subx; + suby_ = suby; dst_offset_ = this->rng_(33); dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; @@ -100,19 +107,26 @@ p_src0 = src0_; p_src1 = src1_; - GetSources(&p_src0, &p_src1, &dst_ref_[0]); + GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times); - Execute(p_src0, p_src1); + Execute(p_src0, p_src1, run_times); for (int r = 0; r < h_; ++r) { for (int c = 0; c < w_; ++c) { ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c], dst_tst_[dst_offset_ + r * dst_stride_ + c]) - << w_ << "x" << h_ << " r: " << r << " c: " << c; + << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_ + << " r: " << r << " c: " << c; } } } + void RunTest(int block_size, int run_times) { + subx_ = Rand1(); + suby_ = Rand1(); + RunOneTest(block_size, subx_, suby_, run_times); + } + DstPixel dst_ref_[kBufSize]; DstPixel dst_tst_[kBufSize]; uint32_t dst_stride_; @@ -148,19 +162,37 @@ class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> { protected: - void Execute(const uint8_t *p_src0, const uint8_t *p_src1) { - params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, - src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, - kMaxMaskWidth, w_, h_, subx_, suby_); - ASM_REGISTER_STATE_CHECK(params_.tst_func( - dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, - src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth, - w_, h_, subx_, suby_)); + void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } } }; TEST_P(BlendA64MaskTest8B, RandomValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; for (int i = 0; i < kBufSize; ++i) { dst_ref_[i] = rng_.Rand8(); dst_tst_[i] = rng_.Rand8(); @@ -172,12 +204,13 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); - RunTest(); + RunTest(bsize, 1); } } TEST_P(BlendA64MaskTest8B, ExtremeValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; for (int i = 0; i < kBufSize; ++i) { dst_ref_[i] = rng_(2) + 254; dst_tst_[i] = rng_(2) + 254; @@ -188,14 +221,39 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; - RunTest(); + RunTest(bsize, 1); } } +TEST_P(BlendA64MaskTest8B, DISABLED_Speed) { + const int kRunTimes = 10000000; + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunOneTest(bsize, 1, 1, kRunTimes); + RunOneTest(bsize, 1, 0, kRunTimes); + RunOneTest(bsize, 0, 1, kRunTimes); + RunOneTest(bsize, 0, 0, kRunTimes); + } +} #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B, ::testing::Values(TestFuncs( aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1))); +#endif // HAVE_AVX2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B, + ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1, + aom_blend_a64_mask_avx2))); #endif // HAVE_SSE4_1 ////////////////////////////////////////////////////////////////////////////// @@ -215,22 +273,40 @@ // max number of bits used by the source static const int kSrcMaxBitsMask = 0x3fff; - void Execute(const uint16_t *p_src0, const uint16_t *p_src1) { + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { ConvolveParams conv_params; conv_params.round_0 = ROUND0_BITS; conv_params.round_1 = COMPOUND_ROUND1_BITS; - params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, - src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, - kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); - ASM_REGISTER_STATE_CHECK(params_.tst_func( - dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, - src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth, - w_, h_, subx_, suby_, &conv_params)); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } } }; TEST_P(BlendA64MaskTest8B_d16, RandomValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; for (int i = 0; i < kBufSize; ++i) { dst_ref_[i] = rng_.Rand8(); dst_tst_[i] = rng_.Rand8(); @@ -242,12 +318,13 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); - RunTest(); + RunTest(bsize, 1); } } TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; for (int i = 0; i < kBufSize; ++i) { dst_ref_[i] = 255; dst_tst_[i] = 255; @@ -259,7 +336,7 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1; - RunTest(); + RunTest(bsize, 1); } } @@ -297,16 +374,31 @@ class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> { protected: - void Execute(const uint16_t *p_src0, const uint16_t *p_src1) { - params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, - CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, - CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, - mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); - ASM_REGISTER_STATE_CHECK(params_.tst_func( - CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, - CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, - CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, - kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_)); + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } } int bit_depth_; @@ -314,6 +406,7 @@ TEST_P(BlendA64MaskTestHBD, RandomValues) { for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; switch (rng_(3)) { case 0: bit_depth_ = 8; break; case 1: bit_depth_ = 10; break; @@ -332,12 +425,13 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); - RunTest(); + RunTest(bsize, 1); } } TEST_P(BlendA64MaskTestHBD, ExtremeValues) { for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; switch (rng_(3)) { case 0: bit_depth_ = 8; break; case 1: bit_depth_ = 10; break; @@ -357,7 +451,7 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; - RunTest(); + RunTest(bsize, 1); } } @@ -387,21 +481,37 @@ static const int kSrcMaxBitsMask = (1 << 14) - 1; static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1; - void Execute(const uint16_t *p_src0, const uint16_t *p_src1) { + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { ConvolveParams conv_params; conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; conv_params.round_1 = COMPOUND_ROUND1_BITS; - - params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, - p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_, - src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_, - &conv_params, bit_depth_); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params, + bit_depth_); + } if (params_.tst_func) { - ASM_REGISTER_STATE_CHECK(params_.tst_func( - CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, - p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_, - src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_, - &conv_params, bit_depth_)); + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), + dst_stride_, p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params, + bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } } } @@ -412,6 +522,7 @@ TEST_P(BlendA64MaskTestHBD_d16, RandomValues) { if (params_.tst_func == NULL) return; for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; switch (rng_(3)) { case 0: bit_depth_ = 8; break; case 1: bit_depth_ = 10; break; @@ -431,26 +542,28 @@ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); - RunTest(); + RunTest(bsize, 1); } } +// TODO (Scott LaVarnway), fix this test +TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) { + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) { + src_max_bits_mask_ = + (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD; -TEST_P(BlendA64MaskTestHBD_d16, SaturatedValues) { - for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) { - src_max_bits_mask_ = - (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = (1 << bit_depth_) - 1; - for (int i = 0; i < kBufSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = (1 << bit_depth_) - 1; + src0_[i] = src_max_bits_mask_; + src1_[i] = src_max_bits_mask_; + } - src0_[i] = src_max_bits_mask_; - src1_[i] = src_max_bits_mask_; + for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA; + + RunTest(bsize, 1); } - - for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA; - - RunTest(); } }