Add missing AVX2 optimizations for highbd variance and sub-pixel variance functions
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T | MAX_ENC_T |
| :-----: | :-----: | :------: | :------: | :-----: | :---: | :-------: |
| 0 | av2_a3 | +0.000% | +0.000% | +0.000% | -4.0% | -3.4% |
| 0 | av2_a4 | +0.000% | +0.000% | +0.000% | -4.9% | -3.6% |
| 0 | av2_a5 | +0.000% | +0.000% | +0.000% | -3.5% | -2.9% |
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 3423edb..a08d894 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -568,51 +568,60 @@
}
#
- # Comp Avg
+ # Variance
#
add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x128 sse2/;
+ specialize qw/aom_highbd_12_variance128x128 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x64 sse2/;
+ specialize qw/aom_highbd_12_variance128x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x128 sse2/;
+ specialize qw/aom_highbd_12_variance64x128 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x64 sse2/;
+ specialize qw/aom_highbd_12_variance64x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x32 sse2/;
+ specialize qw/aom_highbd_12_variance64x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x64 sse2/;
+ specialize qw/aom_highbd_12_variance32x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x32 sse2/;
+ specialize qw/aom_highbd_12_variance32x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x16 sse2/;
+ specialize qw/aom_highbd_12_variance32x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x32 sse2/;
+ specialize qw/aom_highbd_12_variance16x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x16 sse2/;
+ specialize qw/aom_highbd_12_variance16x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x8 sse2/;
+ specialize qw/aom_highbd_12_variance16x8 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x16 sse2/;
+ specialize qw/aom_highbd_12_variance8x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x8 sse2/;
+ specialize qw/aom_highbd_12_variance8x8 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance8x32 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_12_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x8 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_12_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x64 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_12_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x16 sse2 avx2/;
+
add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
@@ -656,49 +665,67 @@
add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance8x32 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_10_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x8 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_10_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x64 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_10_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x16 sse2 avx2/;
+
add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x128 sse2/;
+ specialize qw/aom_highbd_8_variance128x128 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x64 sse2/;
+ specialize qw/aom_highbd_8_variance128x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x128 sse2/;
+ specialize qw/aom_highbd_8_variance64x128 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x64 sse2/;
+ specialize qw/aom_highbd_8_variance64x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x32 sse2/;
+ specialize qw/aom_highbd_8_variance64x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x64 sse2/;
+ specialize qw/aom_highbd_8_variance32x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x32 sse2/;
+ specialize qw/aom_highbd_8_variance32x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x16 sse2/;
+ specialize qw/aom_highbd_8_variance32x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x32 sse2/;
+ specialize qw/aom_highbd_8_variance16x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x16 sse2/;
+ specialize qw/aom_highbd_8_variance16x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x8 sse2/;
+ specialize qw/aom_highbd_8_variance16x8 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x16 sse2/;
+ specialize qw/aom_highbd_8_variance8x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x8 sse2/;
+ specialize qw/aom_highbd_8_variance8x8 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance8x32 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_8_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x8 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_8_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x64 sse2 avx2/;
+ add_proto qw/unsigned int aom_highbd_8_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x16 sse2 avx2/;
+
add_proto qw/void aom_highbd_8_get16x16var/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void aom_highbd_8_get8x8var/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
@@ -744,37 +771,47 @@
# Subpixel Variance
#
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 avx2/;
specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
+  specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 avx2/; # NOTE(review): avx2 is commented out for the other 12-bit subpel sizes pending 12-bit support -- confirm 16x8 is safe at 12-bit
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
@@ -788,6 +825,23 @@
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance64x16 sse2 avx2/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance16x64 sse2 avx2/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ # specialize qw/aom_highbd_12_sub_pixel_variance32x8 sse2 avx2/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_12_sub_pixel_variance16x4 sse2 avx2/; # NOTE(review): avx2 is commented out for the other 12-bit subpel sizes pending 12-bit support -- confirm 16x4 is safe at 12-bit
+
+
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
@@ -822,10 +876,10 @@
specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
@@ -833,38 +887,52 @@
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x16 sse2 avx2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x64 sse2 avx2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x8 sse2 avx2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x4 sse2 avx2/;
+
+
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 avx2/;
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
@@ -878,6 +946,23 @@
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x16 sse2 avx2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x64 sse2 avx2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x8 sse2 avx2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x4 sse2 avx2/;
+
+ #
+ # Subpixel Avg Variance
+ #
+
add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse, const uint16_t *second_pred";
specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 199c22a..008eb46 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -21,11 +21,12 @@
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
-static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+// TODO(any): Add support for 12-bit input; this path is not used for 12-bit yet.
+static AOM_FORCE_INLINE void aom_highbd_var_filter_block2d_bil_avx2(
const uint16_t *src_ptr, unsigned int src_pixels_per_line, int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint32_t xoffset, const uint32_t yoffset, const uint16_t *dst_ptr,
- int dst_stride, uint32_t *sse) {
+ int dst_stride, uint64_t *sse, int64_t *sum) {
const __m256i filter1 =
_mm256_set1_epi32((uint32_t)(bilinear_filters_2t[xoffset][1] << 16) |
bilinear_filters_2t[xoffset][0]);
@@ -40,7 +41,7 @@
uint16_t *dst_ptr_ref = (uint16_t *)dst_ptr;
int64_t sum_long = 0;
uint64_t sse_long = 0;
- unsigned int rshift = 0, inc = 1;
+ unsigned int inc = 1;
__m256i rbias = _mm256_set1_epi32(bitshift);
__m256i opointer[8];
unsigned int range;
@@ -82,9 +83,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
-
} else if (yoffset == 4) { // xoffset==0 && yoffset==4
range = output_width / 16;
if (output_height == 8) inc = 2;
@@ -131,9 +129,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
-
} else { // xoffset==0 && yoffset==1,2,3,5,6,7
range = output_width / 16;
if (output_height == 8) inc = 2;
@@ -195,8 +190,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
}
} else if (xoffset == 4) {
if (yoffset == 0) { // xoffset==4 && yoffset==0
@@ -266,9 +259,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
-
} else if (yoffset == 4) { // xoffset==4 && yoffset==4
range = output_width / 16;
if (output_height == 8) inc = 2;
@@ -318,9 +308,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
-
} else { // xoffset==4 && yoffset==1,2,3,5,6,7
range = output_width / 16;
if (output_height == 8) inc = 2;
@@ -386,8 +373,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
}
} else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0
range = output_width / 16;
@@ -440,9 +425,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
-
} else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4
range = output_width / 16;
@@ -517,9 +499,6 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
-
} else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
range = output_width / 16;
if (output_height == 8) inc = 2;
@@ -605,16 +584,10 @@
sum_long += _mm_extract_epi32(v_d, 0);
sse_long += _mm_extract_epi32(v_d, 1);
}
-
- rshift = get_msb(output_height) + get_msb(output_width);
}
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
- int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
-
- int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
-
- return (var > 0) ? var : 0;
+ *sse = sse_long;
+ *sum = sum_long;
}
void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
@@ -652,6 +625,8 @@
*sse = _mm_extract_epi32(v_d, 1);
}
+// TODO(any): Rewrite this function to support 12-bit input;
+// its intermediate accumulation overflows for 12-bit values.
void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum) {
@@ -680,13 +655,13 @@
*sse = _mm_extract_epi32(v_d, 1);
}
-static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride, int w,
- int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
+static AOM_FORCE_INLINE void highbd_variance_avx2(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int w, int h, uint64_t *sse, int64_t *sum, high_variance_fn_t var_fn,
+ int block_size) {
int i, j;
uint64_t sse_long = 0;
- int32_t sum_long = 0;
+ int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
@@ -698,10 +673,67 @@
sum_long += sum0;
}
}
- *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sum = sum_long;
+ *sse = sse_long;
+}
+
+static AOM_INLINE void highbd_12_variance_avx2(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn,
+ int block_size) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long,
+ &sum_long, var_fn, block_size);
+
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+static AOM_INLINE void highbd_10_variance_avx2(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn,
+ int block_size) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long,
+ &sum_long, var_fn, block_size);
+
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}
+static AOM_INLINE void highbd_8_variance_avx2(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn,
+ int block_size) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long,
+ &sum_long, var_fn, block_size);
+
+ *sum = (int)sum_long;
+ *sse = (uint32_t)sse_long;
+}
+
+// The 12-bit function is separated out because aom_highbd_calc16x16var_avx2
+// currently cannot handle 12-bit inputs
+#define VAR_FN_BD12(w, h, block_size, shift) \
+ uint32_t aom_highbd_12_variance##w##x##h##_avx2( \
+ const uint16_t *src, int src_stride, const uint16_t *ref, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
#define VAR_FN(w, h, block_size, shift) \
uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
const uint16_t *src, int src_stride, const uint16_t *ref, \
@@ -713,6 +745,17 @@
aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ uint32_t aom_highbd_8_variance##w##x##h##_avx2( \
+ const uint16_t *src, int src_stride, const uint16_t *ref, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_8_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
}
VAR_FN(128, 128, 16, 14);
@@ -726,54 +769,125 @@
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
-VAR_FN(16, 4, 16, 6);
+VAR_FN(8, 8, 8, 6);
+
VAR_FN(8, 32, 8, 8);
VAR_FN(32, 8, 8, 8);
VAR_FN(16, 64, 16, 10);
VAR_FN(64, 16, 16, 10);
VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
+// All 12-bit variants use block_size 8 (the 8x8 kernel) only, because
+// aom_highbd_calc16x16var_avx2 cannot handle 12-bit inputs — see the
+// VAR_FN_BD12 comment above.
+VAR_FN_BD12(128, 128, 8, 14);
+VAR_FN_BD12(128, 64, 8, 13);
+VAR_FN_BD12(64, 128, 8, 13);
+VAR_FN_BD12(64, 64, 8, 12);
+VAR_FN_BD12(64, 32, 8, 11);
+VAR_FN_BD12(32, 64, 8, 11);
+VAR_FN_BD12(32, 32, 8, 10);
+VAR_FN_BD12(32, 16, 8, 9);
+VAR_FN_BD12(16, 32, 8, 9);
+VAR_FN_BD12(16, 16, 8, 8);
+VAR_FN_BD12(16, 8, 8, 7);
+VAR_FN_BD12(8, 8, 8, 6);
+
+VAR_FN_BD12(8, 32, 8, 8);
+VAR_FN_BD12(32, 8, 8, 8);
+VAR_FN_BD12(16, 64, 8, 10);
+VAR_FN_BD12(64, 16, 8, 10);
+VAR_FN_BD12(8, 16, 8, 7);
#undef VAR_FN
+#undef VAR_FN_BD12
-#define SSE2_Height(H) \
- uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \
- const uint16_t *src, int src_stride, int x_offset, int y_offset, \
- const uint16_t *dst, int dst_stride, uint32_t *sse_ptr);
-
-SSE2_Height(8);
-SSE2_Height(16);
-#undef SSE2_Height
-
-#define HIGHBD_SUBPIX_VAR(W, H) \
- uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \
- const uint16_t *src, int src_stride, int xoffset, int yoffset, \
- const uint16_t *dst, int dst_stride, uint32_t *sse) { \
- if (W == 8 && H == 16) \
- return aom_highbd_10_sub_pixel_variance8x16_sse2( \
- src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
- else if (W == 8 && H == 8) \
- return aom_highbd_10_sub_pixel_variance8x8_sse2( \
- src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
- else \
- return aom_highbd_var_filter_block2d_bil_avx2( \
- src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+// The 12-bit function is separated out because
+// aom_highbd_var_filter_block2d_bil_avx2 overflows when bsize >= 16x16
+// Emits aom_highbd_12_sub_pixel_variance<W>x<H>_avx2: bilinear-filter the
+// source by (xoffset, yoffset), then form the variance with 12-bit rounding
+// (sse >> 8, sum >> 4); `rshift` == log2(W*H).
+// NOTE(review): `*sse - (uint32_t)(...)` wraps in unsigned arithmetic when
+// sum^2/(W*H) exceeds *sse; the `var > 0` guard then relies on the
+// out-of-range unsigned -> int32_t conversion producing a non-positive
+// value, which is implementation-defined — confirm, or compute in int64_t
+// as VAR_FN_BD12 does.
+#define HIGHBD_SUBPIX_VAR_BD12(W, H, rshift) \
+ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_avx2( \
+ const uint16_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint16_t *dst, int dst_stride, uint32_t *sse) { \
+ uint64_t sse_long = 0; \
+ int64_t sum = 0; \
+ \
+ aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \
+ yoffset, dst, dst_stride, \
+ &sse_long, &sum); \
+ \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ \
+ int32_t var = *sse - (uint32_t)((sum * sum) >> rshift); \
+ \
+ return (var > 0) ? var : 0; \
+ }
-HIGHBD_SUBPIX_VAR(128, 128);
-HIGHBD_SUBPIX_VAR(128, 64);
-HIGHBD_SUBPIX_VAR(64, 128);
-HIGHBD_SUBPIX_VAR(64, 64);
-HIGHBD_SUBPIX_VAR(64, 32);
-HIGHBD_SUBPIX_VAR(32, 64);
-HIGHBD_SUBPIX_VAR(32, 32);
-HIGHBD_SUBPIX_VAR(32, 16);
-HIGHBD_SUBPIX_VAR(16, 32);
-HIGHBD_SUBPIX_VAR(16, 16);
-HIGHBD_SUBPIX_VAR(16, 8);
-HIGHBD_SUBPIX_VAR(8, 16);
-HIGHBD_SUBPIX_VAR(8, 8);
+// Emits the 10-bit and 8-bit sub-pixel variance kernels for one block size:
+// bilinear-filter the source by (xoffset, yoffset), then apply the same
+// bit-depth rounding used by the full-pel variants above (10-bit:
+// sse >> 4, sum >> 2; 8-bit: none). `rshift` == log2(W*H).
+// NOTE(review): same unsigned-wrap pattern in `var` as in
+// HIGHBD_SUBPIX_VAR_BD12 — see the note there.
+#define HIGHBD_SUBPIX_VAR(W, H, rshift) \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \
+ const uint16_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint16_t *dst, int dst_stride, uint32_t *sse) { \
+ uint64_t sse_long = 0; \
+ int64_t sum = 0; \
+ \
+ aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \
+ yoffset, dst, dst_stride, \
+ &sse_long, &sum); \
+ \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ \
+ int32_t var = *sse - (uint32_t)((sum * sum) >> rshift); \
+ \
+ return (var > 0) ? var : 0; \
+ } \
+ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_avx2( \
+ const uint16_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint16_t *dst, int dst_stride, uint32_t *sse) { \
+ uint64_t sse_long = 0; \
+ int64_t sum = 0; \
+ \
+ aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \
+ yoffset, dst, dst_stride, \
+ &sse_long, &sum); \
+ \
+ *sse = (uint32_t)sse_long; \
+ int32_t var = *sse - (uint32_t)((sum * sum) >> rshift); \
+ \
+ return (var > 0) ? var : 0; \
+ }
+
+HIGHBD_SUBPIX_VAR(128, 128, 14);
+HIGHBD_SUBPIX_VAR(128, 64, 13);
+HIGHBD_SUBPIX_VAR(64, 128, 13);
+HIGHBD_SUBPIX_VAR(64, 64, 12);
+HIGHBD_SUBPIX_VAR(64, 32, 11);
+HIGHBD_SUBPIX_VAR(32, 64, 11);
+HIGHBD_SUBPIX_VAR(32, 32, 10);
+HIGHBD_SUBPIX_VAR(32, 16, 9);
+HIGHBD_SUBPIX_VAR(16, 32, 9);
+HIGHBD_SUBPIX_VAR(16, 16, 8);
+HIGHBD_SUBPIX_VAR(16, 8, 7);
+
+HIGHBD_SUBPIX_VAR(64, 16, 10);
+HIGHBD_SUBPIX_VAR(16, 64, 10);
+HIGHBD_SUBPIX_VAR(32, 8, 8);
+HIGHBD_SUBPIX_VAR(16, 4, 6);
+
+// NOTE(review): the 12-bit sub-pixel kernels for sizes >= 16x16 are left
+// disabled because aom_highbd_var_filter_block2d_bil_avx2 overflows there
+// (see the HIGHBD_SUBPIX_VAR_BD12 comment). Prefer deleting these
+// commented-out invocations over keeping dead code.
+// HIGHBD_SUBPIX_VAR_BD12(128, 128, 14);
+// HIGHBD_SUBPIX_VAR_BD12(128, 64, 13);
+// HIGHBD_SUBPIX_VAR_BD12(64, 128, 13);
+// HIGHBD_SUBPIX_VAR_BD12(64, 64, 12);
+// HIGHBD_SUBPIX_VAR_BD12(64, 32, 11);
+// HIGHBD_SUBPIX_VAR_BD12(32, 64, 11);
+// HIGHBD_SUBPIX_VAR_BD12(32, 32, 10);
+// HIGHBD_SUBPIX_VAR_BD12(32, 16, 9);
+// HIGHBD_SUBPIX_VAR_BD12(16, 32, 9);
+// HIGHBD_SUBPIX_VAR_BD12(16, 16, 8);
+HIGHBD_SUBPIX_VAR_BD12(16, 8, 7);
+
+// HIGHBD_SUBPIX_VAR_BD12(64, 16, 10);
+// HIGHBD_SUBPIX_VAR_BD12(16, 64, 10);
+// HIGHBD_SUBPIX_VAR_BD12(32, 8, 8);
+HIGHBD_SUBPIX_VAR_BD12(16, 4, 6);
#undef HIGHBD_SUBPIX_VAR
+#undef HIGHBD_SUBPIX_VAR_BD12
uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
uint16_t *src, int sstride, int h) {
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a706700..1c23d68 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -407,7 +407,8 @@
aom_usec_timer_mark(&timer);
const double elapsed_time =
static_cast<double>(aom_usec_timer_elapsed(&timer));
- printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time);
+ printf("Bitdepth: %d, Variance %dx%d : %7.2fns\n", params_.bit_depth, width(),
+ height(), elapsed_time);
}
////////////////////////////////////////////////////////////////////////////////
@@ -1357,6 +1358,25 @@
10)));
const VarianceParams kArrayHBDVariance_avx2[] = {
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_avx2, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_avx2, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_avx2, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_avx2, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_avx2, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_avx2, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_avx2, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_avx2, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_avx2, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_avx2, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_avx2, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_avx2, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_avx2, 8),
+
+ VarianceParams(3, 5, &aom_highbd_8_variance8x32_avx2, 8),
+ VarianceParams(5, 3, &aom_highbd_8_variance32x8_avx2, 8),
+ VarianceParams(4, 6, &aom_highbd_8_variance16x64_avx2, 8),
+ VarianceParams(6, 4, &aom_highbd_8_variance64x16_avx2, 8),
+
VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10),
@@ -1370,12 +1390,58 @@
VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10),
+
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_avx2, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_avx2, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_avx2, 10),
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_avx2, 10),
+
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_avx2, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_avx2, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_avx2, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_avx2, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_avx2, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_avx2, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_avx2, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_avx2, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_avx2, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_avx2, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_avx2, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_avx2, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_avx2, 12),
+
+ VarianceParams(3, 5, &aom_highbd_12_variance8x32_avx2, 12),
+ VarianceParams(5, 3, &aom_highbd_12_variance32x8_avx2, 12),
+ VarianceParams(4, 6, &aom_highbd_12_variance16x64_avx2, 12),
+ VarianceParams(6, 4, &aom_highbd_12_variance64x16_avx2, 12),
};
INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
::testing::ValuesIn(kArrayHBDVariance_avx2));
const SubpelVarianceParams kArrayHBDSubpelVariance_avx2[] = {
+ // SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_avx2,
+ // 12),
+ // SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_avx2,
+ // 12),
+ // SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_avx2,
+ // 12),
+ // SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_avx2,
+ // 12),
+ // SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_avx2,
+ // 12),
+ // SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_avx2,
+ // 12),
+ // SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_avx2,
+ // 12),
+ // SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_avx2,
+ // 12),
+ // SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_avx2,
+ // 12),
+ // SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_avx2,
+ // 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_avx2, 12),
+
SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_avx2, 10),
SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_avx2, 10),
SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_avx2, 10),
@@ -1387,8 +1453,35 @@
SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_avx2, 10),
SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_avx2, 10),
SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_avx2, 10),
- SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_avx2, 10),
- SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_avx2, 10),
+
+ SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_avx2, 8),
+ SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_avx2, 8),
+ SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_avx2, 8),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_avx2, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_avx2, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_avx2, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_avx2, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_avx2, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_avx2, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_avx2, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_avx2, 8),
+
+ // SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_avx2,
+ // 12),
+ // SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_avx2,
+ // 12),
+ // SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_avx2, 12),
+ SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_avx2, 12),
+
+ SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_avx2, 10),
+ SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_avx2, 10),
+ SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_avx2, 10),
+ SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_avx2, 10),
+
+ SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_avx2, 8),
+ SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_avx2, 8),
+ SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_avx2, 8),
+ SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_avx2, 8),
};
INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDSubpelVarianceTest,