Add blend_a64_mask avx2 code

1. Optimize the blend_a64_mask SSE4 code.
2. Add blend_a64_mask AVX2 code.

Encoder speed-up of about 0.5% without any RD (rate-distortion) change.

test sequence: BasketballDrill_832x480_50.y4m

Test command line: ./aomenc --cpu-used=1 --psnr -D \
 -q --end-usage=vbr --target-bitrate=800 --limit=20 \
 BasketballDrill_832x480_50.y4m -otest.webm

Change-Id: I3c3d1f87f56d2315e1b0353d8fac6efdb7ac2152
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 8bd8911..e45ea4d 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -548,7 +548,7 @@
 add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
 add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
 add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1/;
+specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
 specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
 specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
 
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index d64fca8..67fb4d3 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -20,6 +20,7 @@
 
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
 #include "aom_dsp/x86/blend_mask_sse4.h"
 
 #include "config/aom_dsp_rtcd.h"
@@ -399,3 +400,501 @@
     }
   }
 }
+
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+                                       const __m256i *v_m0_b,
+                                       const __m256i *v_m1_b,
+                                       const int32_t bits) {  // blend 16 pixels
+  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+  // 0xd8 puts pixels 0-7 / 8-15 in the low qword of lane 0 / lane 1.
+  const __m256i v_p0_w =
+      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  // maddubs yields s0 * m0 + s1 * m1 per pixel as 16-bit words.
+  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+  return v_res;  // the 16 result bytes are in the low 128 bits
+}
+
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+                                       const __m256i *v_m0_b,
+                                       const __m256i *v_m1_b,
+                                       const int32_t bits) {  // blend 32 pixels
+  const __m256i v_s0_b = yy_loadu_256(src0);
+  const __m256i v_s1_b = yy_loadu_256(src1);
+  // Interleave pixels with weights; maddubs gives s0 * m0 + s1 * m1 per pixel.
+  const __m256i v_p0_w =
+      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  const __m256i v_p1_w =
+      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+  // The per-lane unpack is undone by the per-lane pack, so order is preserved.
+  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+  return v_res;
+}
+
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {  // one 16-pixel row per pass; the mask is read 2x wide and 2x tall
+    const __m256i v_ral_b = yy_loadu_256(mask);
+    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+    const __m256i v_rvsbl_w =
+        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+    // Each word of v_rsl_w is the sum of one 2x2 cell of mask bytes.
+    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+    // m1 = max alpha - m0 is the weight paired with src1 in the blend helper.
+    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+    // Only the low 128 bits (16 pixels) are stored.
+    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;  // two mask rows are consumed per output row
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+  do {  // one output row per pass; the mask is read 2x wide and 2x tall
+    int c;
+    for (c = 0; c < w; c += 32) {  // 32 output pixels, 64 mask bytes per row
+      const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+      const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+      const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+      const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+      const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+      const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+      const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+      const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+      const __m256i v_rvsbl_w =
+          _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+      const __m256i v_rvsbh_w =
+          _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+      const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+      const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+      // Words of v_rsl_w / v_rsh_w hold the sums of 2x2 cells of mask bytes.
+      const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+      const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+      const __m256i v_m0_b =
+          _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+      // The 0xd8 permute above restores pixel order after the per-lane pack.
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;  // two mask rows are consumed per output row
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  switch (w) {  // width dispatch; mask is read 2x wide and 2x tall
+    case 4:  // 128-bit path, one 4-pixel row per pass
+      do {
+        const __m128i v_ra_b = xx_loadl_64(mask);
+        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        // NOTE(review): shuffle table presumably splits even/odd bytes - confirm.
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 8:  // 128-bit path, one 8-pixel row per pass
+      do {
+        const __m128i v_ra_b = xx_loadu_128(mask);
+        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        // v_rs_w above sums the two shuffled halves of the vertically added rows.
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 16:  // 256-bit kernel for 16-pixel rows
+      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h);
+      break;
+    default:  // every other width goes to the 32-pixels-per-step kernel
+      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, w, h);
+      break;
+  }
+}
+
+static INLINE void blend_a64_mask_sx_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+  do {  // one 16-pixel row per pass; 32 mask bytes are read per row
+    const __m256i v_rl_b = yy_loadu_256(mask);
+    const __m256i v_al_b =
+        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+    // Even bytes of v_al_b are avg(mask[2i], mask[2i + 1]); keep only those.
+    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+    // Per-lane pack leaves weights 0-7 / 8-15 in the low qword of each lane.
+    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+    // Only the low 128 bits (16 pixels) are stored.
+    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {  // one output row per pass; two mask bytes are read per pixel
+    int c;
+    for (c = 0; c < w; c += 32) {  // 32 output pixels, 64 mask bytes
+      const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+      const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+      const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+      const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+      const __m256i v_al_b =
+          _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+      const __m256i v_ah_b =
+          _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+      // NOTE(review): shuffle table presumably splits even/odd bytes - confirm.
+      const __m256i v_m0_b =
+          _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+      // v_m0_b joins the low qwords of both averages; 0xd8 reorders the qwords.
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  switch (w) {  // width dispatch; two mask bytes are read per output pixel
+    case 4:  // 128-bit path, one 4-pixel row per pass
+      do {
+        const __m128i v_r_b = xx_loadl_64(mask);
+        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        // m0 is the average of the low and high qwords of the shuffled mask.
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 8:  // 128-bit path, one 8-pixel row per pass
+      do {
+        const __m128i v_r_b = xx_loadu_128(mask);
+        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        // NOTE(review): shuffle table presumably splits even/odd bytes - confirm.
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 16:  // 256-bit kernel for 16-pixel rows
+      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h);
+      break;
+    default:  // every other width goes to the 32-pixels-per-step kernel
+      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, w, h);
+      break;
+  }
+}
+
+static INLINE void blend_a64_mask_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {  // one 16-pixel row per pass; two mask rows are averaged per row
+    const __m128i v_ra_b = xx_loadu_128(mask);
+    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    // Byte-wise subtract: the weights are 8-bit, as in the sibling kernels.
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+    xx_storeu_128(dst, v_res_b);
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;  // two mask rows are consumed per output row
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {  // one output row per pass; two mask rows are averaged per row
+    int c;
+    for (c = 0; c < w; c += 32) {  // 32 output pixels per step
+      const __m256i v_ra_b = yy_loadu_256(mask + c);
+      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+      // m0 weights src0 and m1 = max alpha - m0 weights src1 in the helper.
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;  // two mask rows are consumed per output row
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  switch (w) {  // width dispatch; two mask rows are read per output row
+    case 4:  // 128-bit path, one 4-pixel row per pass
+      do {
+        const __m128i v_ra_b = xx_loadl_32(mask);
+        const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+        // m0 above is the byte-wise average of two vertically adjacent rows.
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 8:  // 128-bit path, one 8-pixel row per pass
+      do {
+        const __m128i v_ra_b = xx_loadl_64(mask);
+        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += 2 * mask_stride;
+      } while (--h);
+      break;
+    case 16:  // dedicated kernel for 16-pixel rows
+      blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h);
+      break;
+    default:  // every other width goes to the 32-pixels-per-step kernel
+      blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+static INLINE void blend_a64_mask_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {  // one output row per pass; one mask byte per pixel
+    int c;
+    for (c = 0; c < w; c += 32) {  // 32 output pixels per step
+      const __m256i v_m0_b = yy_loadu_256(mask + c);
+      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+      // The mask bytes are used directly as m0; m1 = max alpha - m0.
+      const __m256i v_res_b = blend_32_u8_avx2(
+          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static INLINE void blend_a64_mask_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  switch (w) {  // width dispatch; one mask byte is read per output pixel
+    case 4:  // 128-bit path, one 4-pixel row per pass
+      do {
+        const __m128i v_m0_b = xx_loadl_32(mask);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+        // The mask bytes are used directly as m0; m1 = max alpha - m0.
+        xx_storel_32(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 8:  // 128-bit path, one 8-pixel row per pass
+      do {
+        const __m128i v_m0_b = xx_loadl_64(mask);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storel_64(dst, v_res_b);
+
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    case 16:  // 128-bit path, one 16-pixel row per pass
+      do {
+        const __m128i v_m0_b = xx_loadu_128(mask);
+        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+        xx_storeu_128(dst, v_res_b);
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+        mask += mask_stride;
+      } while (--h);
+      break;
+    default:  // every other width goes to the 32-pixels-per-step kernel
+      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+                             const uint8_t *src0, uint32_t src0_stride,
+                             const uint8_t *src1, uint32_t src1_stride,
+                             const uint8_t *mask, uint32_t mask_stride, int w,
+                             int h, int subx, int suby) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+  // Entry point: blends src0/src1 into dst; subx/suby select the mask kernel.
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+  // Sizes with w or h below 4 take the C path; the rest dispatch on the flags.
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                         mask, mask_stride, w, h, subx, suby);
+  } else {
+    if (subx && suby) {  // logical AND: any two non-zero flags select this path
+      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, mask_stride, w, h);
+    } else if (subx) {
+      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+                             src1_stride, mask, mask_stride, w, h);
+    } else if (suby) {
+      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+                             src1_stride, mask, mask_stride, w, h);
+    } else {
+      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, mask_stride, w, h);
+    }
+  }
+}
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index 55af9d8..9d6b4c2 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -33,19 +33,13 @@
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
                                      int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     xx_storel_32(dst, v_res_b);
 
     dst += dst_stride;
@@ -60,19 +54,13 @@
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
                                      int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_m0_b = xx_loadl_64(mask);
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     xx_storel_64(dst, v_res_b);
 
     dst += dst_stride;
@@ -86,23 +74,17 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
 
   do {
     int c;
     for (c = 0; c < w; c += 16) {
-      const __m128i v_m0l_b = xx_loadl_64(mask + c);
-      const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
-      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
-      const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+      const __m128i v_m0_b = xx_loadu_128(mask + c);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -121,23 +103,20 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
     xx_storel_32(dst, v_res_b);
 
     dst += dst_stride;
@@ -151,22 +130,20 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_r_b = xx_loadu_128(mask);
-    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_64(dst, v_res_b);
 
@@ -181,28 +158,24 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
 
   do {
     int c;
     for (c = 0; c < w; c += 16) {
-      const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
-      const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
-      const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -221,21 +194,18 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
   do {
     const __m128i v_ra_b = xx_loadl_32(mask);
     const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_32(dst, v_res_b);
 
@@ -250,21 +220,16 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
   (void)w;
 
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     const __m128i v_ra_b = xx_loadl_64(mask);
     const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_64(dst, v_res_b);
 
@@ -279,26 +244,18 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zero = _mm_setzero_si128();
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     int c;
     for (c = 0; c < w; c += 16) {
       const __m128i v_ra_b = xx_loadu_128(mask + c);
       const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
-      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
-      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
@@ -317,27 +274,24 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   (void)w;
 
   do {
     const __m128i v_ra_b = xx_loadl_64(mask);
     const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
     const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-    const __m128i v_rvsb_w =
-        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
+    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_32(dst, v_res_b);
 
@@ -352,27 +306,25 @@
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   (void)w;
 
   do {
     const __m128i v_ra_b = xx_loadu_128(mask);
     const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+
     const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-    const __m128i v_rvsb_w =
-        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
+    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
     const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
 
     xx_storel_64(dst, v_res_b);
 
@@ -389,8 +341,8 @@
     const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
   do {
     int c;
     for (c = 0; c < w; c += 16) {
@@ -411,14 +363,11 @@
 
       const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
       const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
-      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
-      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
 
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      const __m128i v_res_b =
+          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
 
       xx_storeu_128(dst + c, v_res_b);
     }
diff --git a/aom_dsp/x86/blend_a64_vmask_sse4.c b/aom_dsp/x86/blend_a64_vmask_sse4.c
index 59506bd..0649102 100644
--- a/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -39,7 +39,7 @@
     const __m128i v_m0_w = _mm_set1_epi16(*mask);
     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
 
-    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
 
     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
 
@@ -64,7 +64,7 @@
     const __m128i v_m0_w = _mm_set1_epi16(*mask);
     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
 
-    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
 
     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
 
@@ -90,9 +90,9 @@
     const __m128i v_m0_w = _mm_set1_epi16(*mask);
     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
     for (c = 0; c < w; c += 16) {
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
       const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);
+          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
 
       const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
 
diff --git a/aom_dsp/x86/blend_sse4.h b/aom_dsp/x86/blend_sse4.h
index 4880438..f7be7cf 100644
--- a/aom_dsp/x86/blend_sse4.h
+++ b/aom_dsp/x86/blend_sse4.h
@@ -14,20 +14,39 @@
 
 #include "aom_dsp/blend.h"
 #include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = {  // shuffle_epi8 control
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,  // evens, then odds
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,  // same pattern again
+};
 
 //////////////////////////////////////////////////////////////////////////////
 // Common kernels
 //////////////////////////////////////////////////////////////////////////////
 
 static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
+                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   const __m128i v_s0_b = xx_loadl_32(src0);
   const __m128i v_s1_b = xx_loadl_32(src1);
   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
 
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_64(src0);
+  const __m128i v_s1_b = xx_loadl_64(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
 
   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
 
@@ -36,21 +55,49 @@
   return v_res_w;
 }
 
-static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                 const __m128i *rounding) {
+  const __m128i v_s0_b = xx_loadl_32(src0);
+  const __m128i v_s1_b = xx_loadl_32(src1);
+  // Interleave (s0, s1) and (m0, m1) bytes: maddubs gives s0 * m0 + s1 * m1.
+  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  // Callers here pass 1 << (15 - bits), so mulhrs is a rounding shift by bits.
+  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+  return v_res;  // packed bytes, duplicated in both 64-bit halves
+}
+
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                 const __m128i *rounding) {
   const __m128i v_s0_b = xx_loadl_64(src0);
   const __m128i v_s1_b = xx_loadl_64(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
 
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
 
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+  return v_res;
+}
 
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                  const __m128i *rounding) {
+  const __m128i v_s0_b = xx_loadu_128(src0);
+  const __m128i v_s1_b = xx_loadu_128(src1);
 
-  return v_res_w;
+  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
+                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
+  return v_res;
 }
 
 typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
index 39f371f..b4458aa 100644
--- a/aom_dsp/x86/synonyms_avx2.h
+++ b/aom_dsp/x86/synonyms_avx2.h
@@ -61,4 +61,8 @@
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
 }
 
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);  // bits >= 1
+  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());  // (v_s_w + 1) >> 1
+}
 #endif  // AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 4d4f070..66ca6fc 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -43,10 +43,16 @@
 
   virtual ~BlendA64MaskTest() {}
 
-  virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1) = 0;
+  virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1,
+                       int run_times) = 0;
 
   template <typename Pixel>
-  void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/) {
+  void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) {
+    if (run_times > 1) {
+      *src0 = src0_;
+      *src1 = src1_;
+      return;
+    }
     switch (this->rng_(3)) {
       case 0:  // Separate sources
         *src0 = src0_;
@@ -68,19 +74,20 @@
     }
   }
 
-  void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/) {
+  void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/,
+                  int /*run_times*/) {
     *src0 = src0_;
     *src1 = src1_;
   }
 
   uint8_t Rand1() { return this->rng_.Rand8() & 1; }
 
-  void RunTest() {
-    w_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1);
-    h_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1);
-
-    subx_ = Rand1();
-    suby_ = Rand1();
+  void RunOneTest(int block_size, int subx, int suby, int run_times) {
+    w_ = block_size_wide[block_size];
+    h_ = block_size_high[block_size];
+    run_times = run_times > 1 ? run_times / w_ : 1;
+    subx_ = subx;
+    suby_ = suby;
 
     dst_offset_ = this->rng_(33);
     dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
@@ -100,19 +107,26 @@
     p_src0 = src0_;
     p_src1 = src1_;
 
-    GetSources(&p_src0, &p_src1, &dst_ref_[0]);
+    GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times);
 
-    Execute(p_src0, p_src1);
+    Execute(p_src0, p_src1, run_times);
 
     for (int r = 0; r < h_; ++r) {
       for (int c = 0; c < w_; ++c) {
         ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
                   dst_tst_[dst_offset_ + r * dst_stride_ + c])
-            << w_ << "x" << h_ << " r: " << r << " c: " << c;
+            << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_
+            << " r: " << r << " c: " << c;
       }
     }
   }
 
+  void RunTest(int block_size, int run_times) {
+    subx_ = Rand1();
+    suby_ = Rand1();
+    RunOneTest(block_size, subx_, suby_, run_times);
+  }
+
   DstPixel dst_ref_[kBufSize];
   DstPixel dst_tst_[kBufSize];
   uint32_t dst_stride_;
@@ -148,19 +162,37 @@
 
 class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
  protected:
-  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
-    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-                     src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
-                     kMaxMaskWidth, w_, h_, subx_, suby_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
-        dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-        src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth,
-        w_, h_, subx_, suby_));
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 1) {  // print timings only for multi-run (speed) calls
+      printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+             time1, time2);  // usec totals, per aom_usec_timer_elapsed()
+      printf("(%3.2f)\n", time1 / time2);  // ref time / tst time
+    }
   }
 };
 
 TEST_P(BlendA64MaskTest8B, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = rng_.Rand8();
       dst_tst_[i] = rng_.Rand8();
@@ -172,12 +204,13 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
 
 TEST_P(BlendA64MaskTest8B, ExtremeValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = rng_(2) + 254;
       dst_tst_[i] = rng_(2) + 254;
@@ -188,14 +221,39 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
+TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
+  const int kRunTimes = 10000000;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
 
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i)
+      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+    RunOneTest(bsize, 1, 1, kRunTimes);
+    RunOneTest(bsize, 1, 0, kRunTimes);
+    RunOneTest(bsize, 0, 1, kRunTimes);
+    RunOneTest(bsize, 0, 0, kRunTimes);
+  }
+}
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
                         ::testing::Values(TestFuncs(
                             aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
+#endif  // HAVE_AVX2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
+                        ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
+                                                    aom_blend_a64_mask_avx2)));
 #endif  // HAVE_SSE4_1
 
 //////////////////////////////////////////////////////////////////////////////
@@ -215,22 +273,40 @@
   // max number of bits used by the source
   static const int kSrcMaxBitsMask = 0x3fff;
 
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
     ConvolveParams conv_params;
     conv_params.round_0 = ROUND0_BITS;
     conv_params.round_1 = COMPOUND_ROUND1_BITS;
-    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-                     src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
-                     kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
-        dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-        src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth,
-        w_, h_, subx_, suby_, &conv_params));
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 1) {  // print timings only for multi-run calls
+      printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+             time1, time2);  // usec totals, per aom_usec_timer_elapsed()
+      printf("(%3.2f)\n", time1 / time2);  // ref time / tst time
+    }
   }
 };
 
 TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = rng_.Rand8();
       dst_tst_[i] = rng_.Rand8();
@@ -242,12 +318,13 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
 
 TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     for (int i = 0; i < kBufSize; ++i) {
       dst_ref_[i] = 255;
       dst_tst_[i] = 255;
@@ -259,7 +336,7 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
 
@@ -297,16 +374,31 @@
 
 class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
  protected:
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
-    params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
-                     CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
-                     CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
-                     mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
-        CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
-        CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
-        CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_,
-        kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_));
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                       CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                       CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                       mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+                       CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                       CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                       mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 1) {  // print timings only for multi-run calls
+      printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+             time1, time2);  // usec totals, per aom_usec_timer_elapsed()
+      printf("(%3.2f)\n", time1 / time2);  // ref time / tst time
+    }
   }
 
   int bit_depth_;
@@ -314,6 +406,7 @@
 
 TEST_P(BlendA64MaskTestHBD, RandomValues) {
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     switch (rng_(3)) {
       case 0: bit_depth_ = 8; break;
       case 1: bit_depth_ = 10; break;
@@ -332,12 +425,13 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
 
 TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
   for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     switch (rng_(3)) {
       case 0: bit_depth_ = 8; break;
       case 1: bit_depth_ = 10; break;
@@ -357,7 +451,7 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
 
@@ -387,21 +481,37 @@
   static const int kSrcMaxBitsMask = (1 << 14) - 1;
   static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
 
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
     ConvolveParams conv_params;
     conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
     conv_params.round_1 = COMPOUND_ROUND1_BITS;
-
-    params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
-                     p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_,
-                     src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_,
-                     &conv_params, bit_depth_);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+                       bit_depth_);
+    }
     if (params_.tst_func) {
-      ASM_REGISTER_STATE_CHECK(params_.tst_func(
-          CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
-          p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_,
-          src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_,
-          &conv_params, bit_depth_));
+      aom_usec_timer_mark(&timer);  // closes the ref_func timing started above
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < run_times; ++i) {
+        params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_),
+                         dst_stride_, p_src0 + src0_offset_, src0_stride_,
+                         p_src1 + src1_offset_, src1_stride_, mask_,
+                         kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+                         bit_depth_);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 1) {  // print timings only for multi-run calls
+        printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+               time1, time2);  // usec totals, per aom_usec_timer_elapsed()
+        printf("(%3.2f)\n", time1 / time2);  // ref time / tst time
+      }
     }
   }
 
@@ -412,6 +522,7 @@
 TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
   if (params_.tst_func == NULL) return;
   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
     switch (rng_(3)) {
       case 0: bit_depth_ = 8; break;
       case 1: bit_depth_ = 10; break;
@@ -431,26 +542,28 @@
     for (int i = 0; i < kMaxMaskSize; ++i)
       mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
 
-    RunTest();
+    RunTest(bsize, 1);
   }
 }
+// TODO (Scott LaVarnway), fix this test
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) {
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+      src_max_bits_mask_ =
+          (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
 
-TEST_P(BlendA64MaskTestHBD_d16, SaturatedValues) {
-  for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
-    src_max_bits_mask_ =
-        (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = 0;
+        dst_tst_[i] = (1 << bit_depth_) - 1;
 
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = 0;
-      dst_tst_[i] = (1 << bit_depth_) - 1;
+        src0_[i] = src_max_bits_mask_;
+        src1_[i] = src_max_bits_mask_;
+      }
 
-      src0_[i] = src_max_bits_mask_;
-      src1_[i] = src_max_bits_mask_;
+      for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+      RunTest(bsize, 1);
     }
-
-    for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
-
-    RunTest();
   }
 }