Add missing AVX2 optimizations for variance functions

| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | MAX_ENC_T |
| :-----: | :-----: | :------: | :------: | :-----: | :---: | :-------: |
| 0 | av2_a3 | +0.000% | +0.000% | +0.000% | -4.0% | -3.4% |
| 0 | av2_a4 | +0.000% | +0.000% | +0.000% | -4.9% | -3.6% |
| 0 | av2_a5 | +0.000% | +0.000% | +0.000% | -3.5% | -2.9% |

diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 3423edb..a08d894 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -568,51 +568,60 @@ } # - # Comp Avg + # Variance # add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance128x128 sse2/; + specialize qw/aom_highbd_12_variance128x128 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance128x64 sse2/; + specialize qw/aom_highbd_12_variance128x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x128 sse2/; + specialize qw/aom_highbd_12_variance64x128 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x64 sse2/; + specialize qw/aom_highbd_12_variance64x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x32 sse2/; + specialize qw/aom_highbd_12_variance64x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x64 sse2/; + specialize qw/aom_highbd_12_variance32x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x32 sse2/; + specialize qw/aom_highbd_12_variance32x32 sse2 avx2/; add_proto qw/unsigned int 
aom_highbd_12_variance32x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x16 sse2/; + specialize qw/aom_highbd_12_variance32x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x32 sse2/; + specialize qw/aom_highbd_12_variance16x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x16 sse2/; + specialize qw/aom_highbd_12_variance16x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x8 sse2/; + specialize qw/aom_highbd_12_variance16x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x16 sse2/; + specialize qw/aom_highbd_12_variance8x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x8 sse2/; + specialize qw/aom_highbd_12_variance8x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint16_t *src_ptr, int source_stride, 
const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x32 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_12_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x8 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_12_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x64 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_12_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x16 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_10_variance128x128 sse2 avx2/; @@ -656,49 +665,67 @@ add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x32 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_10_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x8 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_10_variance16x64/, 
"const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x64 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_10_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x16 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance128x128 sse2/; + specialize qw/aom_highbd_8_variance128x128 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance128x64 sse2/; + specialize qw/aom_highbd_8_variance128x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x128 sse2/; + specialize qw/aom_highbd_8_variance64x128 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x64 sse2/; + specialize qw/aom_highbd_8_variance64x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x32 sse2/; + specialize qw/aom_highbd_8_variance64x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x64 sse2/; + specialize qw/aom_highbd_8_variance32x64 sse2 avx2/; 
add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x32 sse2/; + specialize qw/aom_highbd_8_variance32x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x16 sse2/; + specialize qw/aom_highbd_8_variance32x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x32 sse2/; + specialize qw/aom_highbd_8_variance16x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x16 sse2/; + specialize qw/aom_highbd_8_variance16x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x8 sse2/; + specialize qw/aom_highbd_8_variance16x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x16 sse2/; + specialize qw/aom_highbd_8_variance8x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x8 sse2/; + specialize qw/aom_highbd_8_variance8x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned 
int *sse"; add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x32 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_8_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x8 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_8_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x64 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_8_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x16 sse2 avx2/; + add_proto qw/void aom_highbd_8_get16x16var/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; add_proto qw/void aom_highbd_8_get8x8var/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; @@ -744,37 +771,47 @@ # Subpixel Variance # add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint16_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # 
specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 avx2/; specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; + specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 avx2/; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; @@ -788,6 +825,23 @@ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance64x16 sse2 avx2/; + specialize 
qw/aom_highbd_12_sub_pixel_variance64x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance16x64 sse2 avx2/; + specialize qw/aom_highbd_12_sub_pixel_variance16x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + # specialize qw/aom_highbd_12_sub_pixel_variance32x8 sse2 avx2/; + specialize qw/aom_highbd_12_sub_pixel_variance32x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x4 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/; @@ -822,10 +876,10 @@ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/; add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/; + specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/; + specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint16_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; @@ -833,38 +887,52 @@ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x16 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x64 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x8 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x4 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const 
uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, 
uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; + specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 avx2/; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; @@ -878,6 +946,23 @@ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x16 sse2 avx2/; + + add_proto 
qw/uint32_t aom_highbd_8_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x64 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x8 sse2 avx2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x4 sse2 avx2/; + + # + # Subpixel Avg Variance + # + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse, const uint16_t *second_pred"; specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 199c22a..008eb46 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -21,11 +21,12 @@ const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum); -static uint32_t aom_highbd_var_filter_block2d_bil_avx2( +// TODO(any): need to support 12-bit +static AOM_FORCE_INLINE void aom_highbd_var_filter_block2d_bil_avx2( const uint16_t *src_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint32_t xoffset, const uint32_t yoffset, const uint16_t *dst_ptr, - int dst_stride, uint32_t *sse) { + int dst_stride, uint64_t *sse, int64_t *sum) { const __m256i filter1 = _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[xoffset][1] << 16) | bilinear_filters_2t[xoffset][0]); @@ -40,7 +41,7 @@ uint16_t *dst_ptr_ref = (uint16_t *)dst_ptr; int64_t sum_long = 0; uint64_t sse_long = 0; - unsigned int rshift = 0, inc = 1; + unsigned int inc = 1; __m256i rbias = _mm256_set1_epi32(bitshift); __m256i opointer[8]; unsigned int range; @@ -82,9 +83,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); - } else if (yoffset == 4) { // xoffset==0 && yoffset==4 range = output_width / 16; if (output_height == 8) inc = 2; @@ -131,9 +129,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); - } else { // xoffset==0 && yoffset==1,2,3,5,6,7 range = output_width / 16; if (output_height == 8) inc = 2; @@ -195,8 +190,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); } } else if (xoffset == 4) { if (yoffset == 0) { // xoffset==4 && yoffset==0 @@ -266,9 +259,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); - } else if (yoffset == 4) { // xoffset==4 && yoffset==4 range = output_width / 16; if (output_height == 8) inc = 2; @@ -318,9 +308,6 @@ 
sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); - } else { // xoffset==4 && yoffset==1,2,3,5,6,7 range = output_width / 16; if (output_height == 8) inc = 2; @@ -386,8 +373,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); } } else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0 range = output_width / 16; @@ -440,9 +425,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); - } else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4 range = output_width / 16; @@ -517,9 +499,6 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); - } else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7 range = output_width / 16; if (output_height == 8) inc = 2; @@ -605,16 +584,10 @@ sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } - - rshift = get_msb(output_height) + get_msb(output_width); } - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); - int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); - - int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift); - - return (var > 0) ? 
var : 0; + *sse = sse_long; + *sum = sum_long; } void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, @@ -652,6 +625,8 @@ *sse = _mm_extract_epi32(v_d, 1); } +// TODO(any): Rewrite this function to make it work for 12-bit input. +// It currently overflows for 12-bit inputs. void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum) { @@ -680,13 +655,13 @@ *sse = _mm_extract_epi32(v_d, 1); } -static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, int w, - int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { +static AOM_FORCE_INLINE void highbd_variance_avx2( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int w, int h, uint64_t *sse, int64_t *sum, high_variance_fn_t var_fn, + int block_size) { int i, j; uint64_t sse_long = 0; - int32_t sum_long = 0; + int64_t sum_long = 0; for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { @@ -698,10 +673,67 @@ sum_long += sum0; } } - *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sum = sum_long; + *sse = sse_long; +} + +static AOM_INLINE void highbd_12_variance_avx2( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, + int block_size) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + + highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long, + 
&sum_long, var_fn, block_size); + + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } +static AOM_INLINE void highbd_8_variance_avx2( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, + int block_size) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + + highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long, + &sum_long, var_fn, block_size); + + *sum = (int)sum_long; + *sse = (uint32_t)sse_long; +} + +// The 12-bit function is separated out because aom_highbd_calc16x16var_avx2 +// currently cannot handle 12-bit inputs +#define VAR_FN_BD12(w, h, block_size, shift) \ + uint32_t aom_highbd_12_variance##w##x##h##_avx2( \ + const uint16_t *src, int src_stride, const uint16_t *ref, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + #define VAR_FN(w, h, block_size, shift) \ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ const uint16_t *src, int src_stride, const uint16_t *ref, \ @@ -713,6 +745,17 @@ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ return (var >= 0) ? (uint32_t)var : 0; \ + } \ + uint32_t aom_highbd_8_variance##w##x##h##_avx2( \ + const uint16_t *src, int src_stride, const uint16_t *ref, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_8_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14); @@ -726,54 +769,125 @@ VAR_FN(16, 32, 16, 9); VAR_FN(16, 16, 16, 8); VAR_FN(16, 8, 8, 7); -VAR_FN(16, 4, 16, 6); +VAR_FN(8, 8, 8, 6); + VAR_FN(8, 32, 8, 8); VAR_FN(32, 8, 8, 8); VAR_FN(16, 64, 16, 10); VAR_FN(64, 16, 16, 10); VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); +VAR_FN_BD12(128, 128, 8, 14); +VAR_FN_BD12(128, 64, 8, 13); +VAR_FN_BD12(64, 128, 8, 13); +VAR_FN_BD12(64, 64, 8, 12); +VAR_FN_BD12(64, 32, 8, 11); +VAR_FN_BD12(32, 64, 8, 11); +VAR_FN_BD12(32, 32, 8, 10); +VAR_FN_BD12(32, 16, 8, 9); +VAR_FN_BD12(16, 32, 8, 9); +VAR_FN_BD12(16, 16, 8, 8); +VAR_FN_BD12(16, 8, 8, 7); +VAR_FN_BD12(8, 8, 8, 6); + +VAR_FN_BD12(8, 32, 8, 8); +VAR_FN_BD12(32, 8, 8, 8); +VAR_FN_BD12(16, 64, 8, 10); +VAR_FN_BD12(64, 16, 8, 10); +VAR_FN_BD12(8, 16, 8, 7); #undef VAR_FN +#undef VAR_FN_BD12 -#define SSE2_Height(H) \ - uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ - const uint16_t *src, int src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, int dst_stride, uint32_t *sse_ptr); - -SSE2_Height(8); -SSE2_Height(16); -#undef SSE2_Height - -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ - const uint16_t *src, int src_stride, int xoffset, int yoffset, \ - const uint16_t *dst, int dst_stride, uint32_t *sse) { \ - if (W == 8 && H == 16) \ - return aom_highbd_10_sub_pixel_variance8x16_sse2( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \ - else if (W == 8 && H == 8) \ - return aom_highbd_10_sub_pixel_variance8x8_sse2( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \ - else \ - return aom_highbd_var_filter_block2d_bil_avx2( \ - src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \ +// The 12-bit function is separated out because +// aom_highbd_var_filter_block2d_bil_avx2 overflows when bsize >= 16x16 +#define HIGHBD_SUBPIX_VAR_BD12(W, H, rshift) \ + uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_avx2( \ + const 
uint16_t *src, int src_stride, int xoffset, int yoffset, \ + const uint16_t *dst, int dst_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum = 0; \ + \ + aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \ + yoffset, dst, dst_stride, \ + &sse_long, &sum); \ + \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + \ + int32_t var = *sse - (uint32_t)((sum * sum) >> rshift); \ + \ + return (var > 0) ? var : 0; \ } -HIGHBD_SUBPIX_VAR(128, 128); -HIGHBD_SUBPIX_VAR(128, 64); -HIGHBD_SUBPIX_VAR(64, 128); -HIGHBD_SUBPIX_VAR(64, 64); -HIGHBD_SUBPIX_VAR(64, 32); -HIGHBD_SUBPIX_VAR(32, 64); -HIGHBD_SUBPIX_VAR(32, 32); -HIGHBD_SUBPIX_VAR(32, 16); -HIGHBD_SUBPIX_VAR(16, 32); -HIGHBD_SUBPIX_VAR(16, 16); -HIGHBD_SUBPIX_VAR(16, 8); -HIGHBD_SUBPIX_VAR(8, 16); -HIGHBD_SUBPIX_VAR(8, 8); +#define HIGHBD_SUBPIX_VAR(W, H, rshift) \ + uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ + const uint16_t *src, int src_stride, int xoffset, int yoffset, \ + const uint16_t *dst, int dst_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum = 0; \ + \ + aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \ + yoffset, dst, dst_stride, \ + &sse_long, &sum); \ + \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + \ + int32_t var = *sse - (uint32_t)((sum * sum) >> rshift); \ + \ + return (var > 0) ? var : 0; \ + } \ + uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_avx2( \ + const uint16_t *src, int src_stride, int xoffset, int yoffset, \ + const uint16_t *dst, int dst_stride, uint32_t *sse) { \ + uint64_t sse_long = 0; \ + int64_t sum = 0; \ + \ + aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \ + yoffset, dst, dst_stride, \ + &sse_long, &sum); \ + \ + *sse = (uint32_t)sse_long; \ + int32_t var = *sse - (uint32_t)((sum * sum) >> rshift); \ + \ + return (var > 0) ? 
var : 0; \ + } + +HIGHBD_SUBPIX_VAR(128, 128, 14); +HIGHBD_SUBPIX_VAR(128, 64, 13); +HIGHBD_SUBPIX_VAR(64, 128, 13); +HIGHBD_SUBPIX_VAR(64, 64, 12); +HIGHBD_SUBPIX_VAR(64, 32, 11); +HIGHBD_SUBPIX_VAR(32, 64, 11); +HIGHBD_SUBPIX_VAR(32, 32, 10); +HIGHBD_SUBPIX_VAR(32, 16, 9); +HIGHBD_SUBPIX_VAR(16, 32, 9); +HIGHBD_SUBPIX_VAR(16, 16, 8); +HIGHBD_SUBPIX_VAR(16, 8, 7); + +HIGHBD_SUBPIX_VAR(64, 16, 10); +HIGHBD_SUBPIX_VAR(16, 64, 10); +HIGHBD_SUBPIX_VAR(32, 8, 8); +HIGHBD_SUBPIX_VAR(16, 4, 6); + +// HIGHBD_SUBPIX_VAR_BD12(128, 128, 14); +// HIGHBD_SUBPIX_VAR_BD12(128, 64, 13); +// HIGHBD_SUBPIX_VAR_BD12(64, 128, 13); +// HIGHBD_SUBPIX_VAR_BD12(64, 64, 12); +// HIGHBD_SUBPIX_VAR_BD12(64, 32, 11); +// HIGHBD_SUBPIX_VAR_BD12(32, 64, 11); +// HIGHBD_SUBPIX_VAR_BD12(32, 32, 10); +// HIGHBD_SUBPIX_VAR_BD12(32, 16, 9); +// HIGHBD_SUBPIX_VAR_BD12(16, 32, 9); +// HIGHBD_SUBPIX_VAR_BD12(16, 16, 8); +HIGHBD_SUBPIX_VAR_BD12(16, 8, 7); + +// HIGHBD_SUBPIX_VAR_BD12(64, 16, 10); +// HIGHBD_SUBPIX_VAR_BD12(16, 64, 10); +// HIGHBD_SUBPIX_VAR_BD12(32, 8, 8); +HIGHBD_SUBPIX_VAR_BD12(16, 4, 6); #undef HIGHBD_SUBPIX_VAR +#undef HIGHBD_SUBPIX_VAR_BD12 uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int h) {
diff --git a/test/variance_test.cc b/test/variance_test.cc index a706700..1c23d68 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc
@@ -407,7 +407,8 @@ aom_usec_timer_mark(&timer); const double elapsed_time = static_cast<double>(aom_usec_timer_elapsed(&timer)); - printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time); + printf("Bitdepth: %d, Variance %dx%d : %7.2fns\n", params_.bit_depth, width(), + height(), elapsed_time); } //////////////////////////////////////////////////////////////////////////////// @@ -1357,6 +1358,25 @@ 10))); const VarianceParams kArrayHBDVariance_avx2[] = { + VarianceParams(7, 7, &aom_highbd_8_variance128x128_avx2, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_avx2, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_avx2, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_avx2, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_avx2, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_avx2, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_avx2, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_avx2, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_avx2, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_avx2, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_avx2, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_avx2, 8), + VarianceParams(3, 3, &aom_highbd_8_variance8x8_avx2, 8), + + VarianceParams(3, 5, &aom_highbd_8_variance8x32_avx2, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_avx2, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_avx2, 8), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_avx2, 8), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10), VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10), VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10), @@ -1370,12 +1390,58 @@ VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10), VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10), VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10), + + VarianceParams(3, 5, &aom_highbd_10_variance8x32_avx2, 10), + VarianceParams(5, 3, 
&aom_highbd_10_variance32x8_avx2, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_avx2, 10), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_avx2, 10), + + VarianceParams(7, 7, &aom_highbd_12_variance128x128_avx2, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_avx2, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_avx2, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_avx2, 12), + VarianceParams(6, 5, &aom_highbd_12_variance64x32_avx2, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_avx2, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_avx2, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_avx2, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_avx2, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_avx2, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_avx2, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_avx2, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_avx2, 12), + + VarianceParams(3, 5, &aom_highbd_12_variance8x32_avx2, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_avx2, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_avx2, 12), + VarianceParams(6, 4, &aom_highbd_12_variance64x16_avx2, 12), }; INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest, ::testing::ValuesIn(kArrayHBDVariance_avx2)); const SubpelVarianceParams kArrayHBDSubpelVariance_avx2[] = { + // SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_avx2, + // 12), + // SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_avx2, + // 12), + // SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_avx2, + // 12), + // SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_avx2, + // 12), + // SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_avx2, + // 12), + // SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_avx2, + // 12), + // SubpelVarianceParams(5, 5, 
&aom_highbd_12_sub_pixel_variance32x32_avx2, + // 12), + // SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_avx2, + // 12), + // SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_avx2, + // 12), + // SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_avx2, + // 12), + SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_avx2, 12), + SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_avx2, 10), SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_avx2, 10), SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_avx2, 10), @@ -1387,8 +1453,35 @@ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_avx2, 10), SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_avx2, 10), SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_avx2, 10), - SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_avx2, 10), - SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_avx2, 10), + + SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_avx2, 8), + SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_avx2, 8), + SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_avx2, 8), + SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_avx2, 8), + SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_avx2, 8), + SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_avx2, 8), + SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_avx2, 8), + SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_avx2, 8), + SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_avx2, 8), + SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_avx2, 8), + SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_avx2, 8), + + // SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_avx2, + // 12), + // 
SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_avx2, + // 12), + // SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_avx2, 12), + SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_avx2, 12), + + SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_avx2, 10), + SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_avx2, 10), + SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_avx2, 10), + SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_avx2, 10), + + SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_avx2, 8), + SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_avx2, 8), + SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_avx2, 8), + SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_avx2, 8), }; INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDSubpelVarianceTest,