Add missing AVX2 optimizations for high-bitdepth variance functions

| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T | MAX_ENC_T |
| :-----: | :-----: | :------: | :------: | :-----: | :---: | :-------: |
|    0    | av2_a3  | +0.000%  | +0.000%  | +0.000% | -4.0% |   -3.4%   |
|    0    | av2_a4  | +0.000%  | +0.000%  | +0.000% | -4.9% |   -3.6%   |
|    0    | av2_a5  | +0.000%  | +0.000%  | +0.000% | -3.5% |   -2.9%   |
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 3423edb..a08d894 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -568,51 +568,60 @@
   }
 
   #
-  # Comp Avg
+  # Variance
   #
   add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance128x128 sse2/;
+  specialize qw/aom_highbd_12_variance128x128 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance128x64 sse2/;
+  specialize qw/aom_highbd_12_variance128x64 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance64x128 sse2/;
+  specialize qw/aom_highbd_12_variance64x128 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance64x64 sse2/;
+  specialize qw/aom_highbd_12_variance64x64 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance64x32 sse2/;
+  specialize qw/aom_highbd_12_variance64x32 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance32x64 sse2/;
+  specialize qw/aom_highbd_12_variance32x64 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance32x32 sse2/;
+  specialize qw/aom_highbd_12_variance32x32 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance32x16 sse2/;
+  specialize qw/aom_highbd_12_variance32x16 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance16x32 sse2/;
+  specialize qw/aom_highbd_12_variance16x32 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance16x16 sse2/;
+  specialize qw/aom_highbd_12_variance16x16 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance16x8 sse2/;
+  specialize qw/aom_highbd_12_variance16x8 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance8x16 sse2/;
+  specialize qw/aom_highbd_12_variance8x16 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_12_variance8x8 sse2/;
+  specialize qw/aom_highbd_12_variance8x8 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
 
+  add_proto qw/unsigned int aom_highbd_12_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_12_variance8x32 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_12_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_12_variance32x8 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_12_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_12_variance16x64 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_12_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_12_variance64x16 sse2 avx2/;
+
   add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
 
@@ -656,49 +665,67 @@
   add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
 
+  add_proto qw/unsigned int aom_highbd_10_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_10_variance8x32 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_10_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_10_variance32x8 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_10_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_10_variance16x64 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_10_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_10_variance64x16 sse2 avx2/;
+
   add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance128x128 sse2/;
+  specialize qw/aom_highbd_8_variance128x128 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance128x64 sse2/;
+  specialize qw/aom_highbd_8_variance128x64 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance64x128 sse2/;
+  specialize qw/aom_highbd_8_variance64x128 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance64x64 sse2/;
+  specialize qw/aom_highbd_8_variance64x64 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance64x32 sse2/;
+  specialize qw/aom_highbd_8_variance64x32 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance32x64 sse2/;
+  specialize qw/aom_highbd_8_variance32x64 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance32x32 sse2/;
+  specialize qw/aom_highbd_8_variance32x32 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance32x16 sse2/;
+  specialize qw/aom_highbd_8_variance32x16 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance16x32 sse2/;
+  specialize qw/aom_highbd_8_variance16x32 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance16x16 sse2/;
+  specialize qw/aom_highbd_8_variance16x16 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance16x8 sse2/;
+  specialize qw/aom_highbd_8_variance16x8 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance8x16 sse2/;
+  specialize qw/aom_highbd_8_variance8x16 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/aom_highbd_8_variance8x8 sse2/;
+  specialize qw/aom_highbd_8_variance8x8 sse2 avx2/;
 
   add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
   add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
 
+  add_proto qw/unsigned int aom_highbd_8_variance8x32/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_8_variance8x32 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_8_variance32x8/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_8_variance32x8 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_8_variance16x64/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_8_variance16x64 sse2 avx2/;
+  add_proto qw/unsigned int aom_highbd_8_variance64x16/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_highbd_8_variance64x16 sse2 avx2/;
+
   add_proto qw/void aom_highbd_8_get16x16var/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
   add_proto qw/void aom_highbd_8_get8x8var/, "const uint16_t *src_ptr, int source_stride, const uint16_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
@@ -744,37 +771,47 @@
   # Subpixel Variance
   #
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 avx2/;
   specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
+  specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
@@ -788,6 +825,23 @@
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
 
+
+  add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance64x16 sse2 avx2/;
+  specialize qw/aom_highbd_12_sub_pixel_variance64x16 sse2/;
+
+  add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance16x64 sse2 avx2/;
+  specialize qw/aom_highbd_12_sub_pixel_variance16x64 sse2/;
+
+  add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # specialize qw/aom_highbd_12_sub_pixel_variance32x8 sse2 avx2/;
+  specialize qw/aom_highbd_12_sub_pixel_variance32x8 sse2/;
+
+  add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_12_sub_pixel_variance16x4 sse2 avx2/;
+
+
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2/;
 
@@ -822,10 +876,10 @@
   specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2/;
+  specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2/;
+  specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
 
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
@@ -833,38 +887,52 @@
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
 
+
+  add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_10_sub_pixel_variance64x16 sse2 avx2/;
+
+  add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_10_sub_pixel_variance16x64 sse2 avx2/;
+
+  add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_10_sub_pixel_variance32x8 sse2 avx2/;
+
+  add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_10_sub_pixel_variance16x4 sse2 avx2/;
+
+
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
+  specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 avx2/;
 
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
@@ -878,6 +946,23 @@
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
 
+
+  add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x16/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_8_sub_pixel_variance64x16 sse2 avx2/;
+
+  add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_8_sub_pixel_variance16x64 sse2 avx2/;
+
+  add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x8/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_8_sub_pixel_variance32x8 sse2 avx2/;
+
+  add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x4/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/aom_highbd_8_sub_pixel_variance16x4 sse2 avx2/;
+
+  #
+  # Subpixel Avg Variance
+  #
+
   add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint16_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint16_t *ref_ptr, int ref_stride, uint32_t *sse, const uint16_t *second_pred";
   specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
 
diff --git a/aom_dsp/x86/highbd_variance_avx2.c b/aom_dsp/x86/highbd_variance_avx2.c
index 199c22a..008eb46 100644
--- a/aom_dsp/x86/highbd_variance_avx2.c
+++ b/aom_dsp/x86/highbd_variance_avx2.c
@@ -21,11 +21,12 @@
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);
 
-static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+// TODO(any): need to support 12-bit
+static AOM_FORCE_INLINE void aom_highbd_var_filter_block2d_bil_avx2(
     const uint16_t *src_ptr, unsigned int src_pixels_per_line, int pixel_step,
     unsigned int output_height, unsigned int output_width,
     const uint32_t xoffset, const uint32_t yoffset, const uint16_t *dst_ptr,
-    int dst_stride, uint32_t *sse) {
+    int dst_stride, uint64_t *sse, int64_t *sum) {
   const __m256i filter1 =
       _mm256_set1_epi32((uint32_t)(bilinear_filters_2t[xoffset][1] << 16) |
                         bilinear_filters_2t[xoffset][0]);
@@ -40,7 +41,7 @@
   uint16_t *dst_ptr_ref = (uint16_t *)dst_ptr;
   int64_t sum_long = 0;
   uint64_t sse_long = 0;
-  unsigned int rshift = 0, inc = 1;
+  unsigned int inc = 1;
   __m256i rbias = _mm256_set1_epi32(bitshift);
   __m256i opointer[8];
   unsigned int range;
@@ -82,9 +83,6 @@
         sum_long += _mm_extract_epi32(v_d, 0);
         sse_long += _mm_extract_epi32(v_d, 1);
       }
-
-      rshift = get_msb(output_height) + get_msb(output_width);
-
     } else if (yoffset == 4) {  // xoffset==0 && yoffset==4
       range = output_width / 16;
       if (output_height == 8) inc = 2;
@@ -131,9 +129,6 @@
         sum_long += _mm_extract_epi32(v_d, 0);
         sse_long += _mm_extract_epi32(v_d, 1);
       }
-
-      rshift = get_msb(output_height) + get_msb(output_width);
-
     } else {  // xoffset==0 && yoffset==1,2,3,5,6,7
       range = output_width / 16;
       if (output_height == 8) inc = 2;
@@ -195,8 +190,6 @@
         sum_long += _mm_extract_epi32(v_d, 0);
         sse_long += _mm_extract_epi32(v_d, 1);
       }
-
-      rshift = get_msb(output_height) + get_msb(output_width);
     }
   } else if (xoffset == 4) {
     if (yoffset == 0) {  // xoffset==4 && yoffset==0
@@ -266,9 +259,6 @@
         sum_long += _mm_extract_epi32(v_d, 0);
         sse_long += _mm_extract_epi32(v_d, 1);
       }
-
-      rshift = get_msb(output_height) + get_msb(output_width);
-
     } else if (yoffset == 4) {  // xoffset==4 && yoffset==4
       range = output_width / 16;
       if (output_height == 8) inc = 2;
@@ -318,9 +308,6 @@
         sum_long += _mm_extract_epi32(v_d, 0);
         sse_long += _mm_extract_epi32(v_d, 1);
       }
-
-      rshift = get_msb(output_height) + get_msb(output_width);
-
     } else {  // xoffset==4 && yoffset==1,2,3,5,6,7
       range = output_width / 16;
       if (output_height == 8) inc = 2;
@@ -386,8 +373,6 @@
         sum_long += _mm_extract_epi32(v_d, 0);
         sse_long += _mm_extract_epi32(v_d, 1);
       }
-
-      rshift = get_msb(output_height) + get_msb(output_width);
     }
   } else if (yoffset == 0) {  // xoffset==1,2,3,5,6,7 && yoffset==0
     range = output_width / 16;
@@ -440,9 +425,6 @@
       sum_long += _mm_extract_epi32(v_d, 0);
       sse_long += _mm_extract_epi32(v_d, 1);
     }
-
-    rshift = get_msb(output_height) + get_msb(output_width);
-
   } else if (yoffset == 4) {  // xoffset==1,2,3,5,6,7 && yoffset==4
 
     range = output_width / 16;
@@ -517,9 +499,6 @@
       sum_long += _mm_extract_epi32(v_d, 0);
       sse_long += _mm_extract_epi32(v_d, 1);
     }
-
-    rshift = get_msb(output_height) + get_msb(output_width);
-
   } else {  // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
     range = output_width / 16;
     if (output_height == 8) inc = 2;
@@ -605,16 +584,10 @@
       sum_long += _mm_extract_epi32(v_d, 0);
       sse_long += _mm_extract_epi32(v_d, 1);
     }
-
-    rshift = get_msb(output_height) + get_msb(output_width);
   }
 
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-  int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
-
-  int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
-
-  return (var > 0) ? var : 0;
+  *sse = sse_long;
+  *sum = sum_long;
 }
 
 void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
@@ -652,6 +625,8 @@
   *sse = _mm_extract_epi32(v_d, 1);
 }
 
+// TODO(any): Rewrite this function to make it work for 12-bit input
+// Overflows for 12-bit inputs
 void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum) {
@@ -680,13 +655,13 @@
   *sse = _mm_extract_epi32(v_d, 1);
 }
 
-static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
+static AOM_FORCE_INLINE void highbd_variance_avx2(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int w, int h, uint64_t *sse, int64_t *sum, high_variance_fn_t var_fn,
+    int block_size) {
   int i, j;
   uint64_t sse_long = 0;
-  int32_t sum_long = 0;
+  int64_t sum_long = 0;
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
@@ -698,10 +673,67 @@
       sum_long += sum0;
     }
   }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sum = sum_long;
+  *sse = sse_long;
+}
+
+static AOM_INLINE void highbd_12_variance_avx2(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn,
+    int block_size) {
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long,
+                       &sum_long, var_fn, block_size);
+
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+static AOM_INLINE void highbd_10_variance_avx2(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn,
+    int block_size) {
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long,
+                       &sum_long, var_fn, block_size);
+
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
 }
 
+static AOM_INLINE void highbd_8_variance_avx2(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn,
+    int block_size) {
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  highbd_variance_avx2(src, src_stride, ref, ref_stride, w, h, &sse_long,
+                       &sum_long, var_fn, block_size);
+
+  *sum = (int)sum_long;
+  *sse = (uint32_t)sse_long;
+}
+
+// The 12-bit function is separated out because aom_highbd_calc16x16var_avx2
+// currently cannot handle 12-bit inputs
+#define VAR_FN_BD12(w, h, block_size, shift)                               \
+  uint32_t aom_highbd_12_variance##w##x##h##_avx2(                         \
+      const uint16_t *src, int src_stride, const uint16_t *ref,            \
+      int ref_stride, uint32_t *sse) {                                     \
+    int sum;                                                               \
+    int64_t var;                                                           \
+    highbd_12_variance_avx2(                                               \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
+        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
+    return (var >= 0) ? (uint32_t)var : 0;                                 \
+  }
+
 #define VAR_FN(w, h, block_size, shift)                                    \
   uint32_t aom_highbd_10_variance##w##x##h##_avx2(                         \
       const uint16_t *src, int src_stride, const uint16_t *ref,            \
@@ -713,6 +745,17 @@
         aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
     var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
     return (var >= 0) ? (uint32_t)var : 0;                                 \
+  }                                                                        \
+  uint32_t aom_highbd_8_variance##w##x##h##_avx2(                          \
+      const uint16_t *src, int src_stride, const uint16_t *ref,            \
+      int ref_stride, uint32_t *sse) {                                     \
+    int sum;                                                               \
+    int64_t var;                                                           \
+    highbd_8_variance_avx2(                                                \
+        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
+        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
+    return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
 VAR_FN(128, 128, 16, 14);
@@ -726,54 +769,125 @@
 VAR_FN(16, 32, 16, 9);
 VAR_FN(16, 16, 16, 8);
 VAR_FN(16, 8, 8, 7);
-VAR_FN(16, 4, 16, 6);
+VAR_FN(8, 8, 8, 6);
+
 VAR_FN(8, 32, 8, 8);
 VAR_FN(32, 8, 8, 8);
 VAR_FN(16, 64, 16, 10);
 VAR_FN(64, 16, 16, 10);
 VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
 
+VAR_FN_BD12(128, 128, 8, 14);
+VAR_FN_BD12(128, 64, 8, 13);
+VAR_FN_BD12(64, 128, 8, 13);
+VAR_FN_BD12(64, 64, 8, 12);
+VAR_FN_BD12(64, 32, 8, 11);
+VAR_FN_BD12(32, 64, 8, 11);
+VAR_FN_BD12(32, 32, 8, 10);
+VAR_FN_BD12(32, 16, 8, 9);
+VAR_FN_BD12(16, 32, 8, 9);
+VAR_FN_BD12(16, 16, 8, 8);
+VAR_FN_BD12(16, 8, 8, 7);
+VAR_FN_BD12(8, 8, 8, 6);
+
+VAR_FN_BD12(8, 32, 8, 8);
+VAR_FN_BD12(32, 8, 8, 8);
+VAR_FN_BD12(16, 64, 8, 10);
+VAR_FN_BD12(64, 16, 8, 10);
+VAR_FN_BD12(8, 16, 8, 7);
 #undef VAR_FN
+#undef VAR_FN_BD12
 
-#define SSE2_Height(H)                                                 \
-  uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(               \
-      const uint16_t *src, int src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, int dst_stride, uint32_t *sse_ptr);
-
-SSE2_Height(8);
-SSE2_Height(16);
-#undef SSE2_Height
-
-#define HIGHBD_SUBPIX_VAR(W, H)                                              \
-  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
-      const uint16_t *src, int src_stride, int xoffset, int yoffset,         \
-      const uint16_t *dst, int dst_stride, uint32_t *sse) {                  \
-    if (W == 8 && H == 16)                                                   \
-      return aom_highbd_10_sub_pixel_variance8x16_sse2(                      \
-          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
-    else if (W == 8 && H == 8)                                               \
-      return aom_highbd_10_sub_pixel_variance8x8_sse2(                       \
-          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
-    else                                                                     \
-      return aom_highbd_var_filter_block2d_bil_avx2(                         \
-          src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+// The 12-bit function is separated out because
+// aom_highbd_var_filter_block2d_bil_avx2 overflows when bsize >= 16x16
+#define HIGHBD_SUBPIX_VAR_BD12(W, H, rshift)                                  \
+  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_avx2(                  \
+      const uint16_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint16_t *dst, int dst_stride, uint32_t *sse) {                   \
+    uint64_t sse_long = 0;                                                    \
+    int64_t sum = 0;                                                          \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \
+                                           yoffset, dst, dst_stride,          \
+                                           &sse_long, &sum);                  \
+                                                                              \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                         \
+    sum = ROUND_POWER_OF_TWO(sum, 4);                                         \
+                                                                              \
+    int32_t var = *sse - (uint32_t)((sum * sum) >> rshift);                   \
+                                                                              \
+    return (var > 0) ? var : 0;                                               \
   }
 
-HIGHBD_SUBPIX_VAR(128, 128);
-HIGHBD_SUBPIX_VAR(128, 64);
-HIGHBD_SUBPIX_VAR(64, 128);
-HIGHBD_SUBPIX_VAR(64, 64);
-HIGHBD_SUBPIX_VAR(64, 32);
-HIGHBD_SUBPIX_VAR(32, 64);
-HIGHBD_SUBPIX_VAR(32, 32);
-HIGHBD_SUBPIX_VAR(32, 16);
-HIGHBD_SUBPIX_VAR(16, 32);
-HIGHBD_SUBPIX_VAR(16, 16);
-HIGHBD_SUBPIX_VAR(16, 8);
-HIGHBD_SUBPIX_VAR(8, 16);
-HIGHBD_SUBPIX_VAR(8, 8);
+#define HIGHBD_SUBPIX_VAR(W, H, rshift)                                       \
+  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                  \
+      const uint16_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint16_t *dst, int dst_stride, uint32_t *sse) {                   \
+    uint64_t sse_long = 0;                                                    \
+    int64_t sum = 0;                                                          \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \
+                                           yoffset, dst, dst_stride,          \
+                                           &sse_long, &sum);                  \
+                                                                              \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                         \
+    sum = ROUND_POWER_OF_TWO(sum, 2);                                         \
+                                                                              \
+    int32_t var = *sse - (uint32_t)((sum * sum) >> rshift);                   \
+                                                                              \
+    return (var > 0) ? var : 0;                                               \
+  }                                                                           \
+  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_avx2(                   \
+      const uint16_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint16_t *dst, int dst_stride, uint32_t *sse) {                   \
+    uint64_t sse_long = 0;                                                    \
+    int64_t sum = 0;                                                          \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_avx2(src, src_stride, 1, H, W, xoffset, \
+                                           yoffset, dst, dst_stride,          \
+                                           &sse_long, &sum);                  \
+                                                                              \
+    *sse = (uint32_t)sse_long;                                                \
+    int32_t var = *sse - (uint32_t)((sum * sum) >> rshift);                   \
+                                                                              \
+    return (var > 0) ? var : 0;                                               \
+  }
+
+HIGHBD_SUBPIX_VAR(128, 128, 14);
+HIGHBD_SUBPIX_VAR(128, 64, 13);
+HIGHBD_SUBPIX_VAR(64, 128, 13);
+HIGHBD_SUBPIX_VAR(64, 64, 12);
+HIGHBD_SUBPIX_VAR(64, 32, 11);
+HIGHBD_SUBPIX_VAR(32, 64, 11);
+HIGHBD_SUBPIX_VAR(32, 32, 10);
+HIGHBD_SUBPIX_VAR(32, 16, 9);
+HIGHBD_SUBPIX_VAR(16, 32, 9);
+HIGHBD_SUBPIX_VAR(16, 16, 8);
+HIGHBD_SUBPIX_VAR(16, 8, 7);
+
+HIGHBD_SUBPIX_VAR(64, 16, 10);
+HIGHBD_SUBPIX_VAR(16, 64, 10);
+HIGHBD_SUBPIX_VAR(32, 8, 8);
+HIGHBD_SUBPIX_VAR(16, 4, 6);
+
+// HIGHBD_SUBPIX_VAR_BD12(128, 128, 14);
+// HIGHBD_SUBPIX_VAR_BD12(128, 64, 13);
+// HIGHBD_SUBPIX_VAR_BD12(64, 128, 13);
+// HIGHBD_SUBPIX_VAR_BD12(64, 64, 12);
+// HIGHBD_SUBPIX_VAR_BD12(64, 32, 11);
+// HIGHBD_SUBPIX_VAR_BD12(32, 64, 11);
+// HIGHBD_SUBPIX_VAR_BD12(32, 32, 10);
+// HIGHBD_SUBPIX_VAR_BD12(32, 16, 9);
+// HIGHBD_SUBPIX_VAR_BD12(16, 32, 9);
+// HIGHBD_SUBPIX_VAR_BD12(16, 16, 8);
+HIGHBD_SUBPIX_VAR_BD12(16, 8, 7);
+
+// HIGHBD_SUBPIX_VAR_BD12(64, 16, 10);
+// HIGHBD_SUBPIX_VAR_BD12(16, 64, 10);
+// HIGHBD_SUBPIX_VAR_BD12(32, 8, 8);
+HIGHBD_SUBPIX_VAR_BD12(16, 4, 6);
 #undef HIGHBD_SUBPIX_VAR
+#undef HIGHBD_SUBPIX_VAR_BD12
 
 uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                        uint16_t *src, int sstride, int h) {
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a706700..1c23d68 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -407,7 +407,8 @@
   aom_usec_timer_mark(&timer);
   const double elapsed_time =
       static_cast<double>(aom_usec_timer_elapsed(&timer));
-  printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time);
+  printf("Bitdepth: %d, Variance %dx%d : %7.2fns\n", params_.bit_depth, width(),
+         height(), elapsed_time);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1357,6 +1358,25 @@
                                       10)));
 
 const VarianceParams kArrayHBDVariance_avx2[] = {
+  VarianceParams(7, 7, &aom_highbd_8_variance128x128_avx2, 8),
+  VarianceParams(7, 6, &aom_highbd_8_variance128x64_avx2, 8),
+  VarianceParams(6, 7, &aom_highbd_8_variance64x128_avx2, 8),
+  VarianceParams(6, 6, &aom_highbd_8_variance64x64_avx2, 8),
+  VarianceParams(6, 5, &aom_highbd_8_variance64x32_avx2, 8),
+  VarianceParams(5, 6, &aom_highbd_8_variance32x64_avx2, 8),
+  VarianceParams(5, 5, &aom_highbd_8_variance32x32_avx2, 8),
+  VarianceParams(5, 4, &aom_highbd_8_variance32x16_avx2, 8),
+  VarianceParams(4, 5, &aom_highbd_8_variance16x32_avx2, 8),
+  VarianceParams(4, 4, &aom_highbd_8_variance16x16_avx2, 8),
+  VarianceParams(4, 3, &aom_highbd_8_variance16x8_avx2, 8),
+  VarianceParams(3, 4, &aom_highbd_8_variance8x16_avx2, 8),
+  VarianceParams(3, 3, &aom_highbd_8_variance8x8_avx2, 8),
+
+  VarianceParams(3, 5, &aom_highbd_8_variance8x32_avx2, 8),
+  VarianceParams(5, 3, &aom_highbd_8_variance32x8_avx2, 8),
+  VarianceParams(4, 6, &aom_highbd_8_variance16x64_avx2, 8),
+  VarianceParams(6, 4, &aom_highbd_8_variance64x16_avx2, 8),
+
   VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
   VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
   VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10),
@@ -1370,12 +1390,58 @@
   VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
   VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
   VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10),
+
+  VarianceParams(3, 5, &aom_highbd_10_variance8x32_avx2, 10),
+  VarianceParams(5, 3, &aom_highbd_10_variance32x8_avx2, 10),
+  VarianceParams(4, 6, &aom_highbd_10_variance16x64_avx2, 10),
+  VarianceParams(6, 4, &aom_highbd_10_variance64x16_avx2, 10),
+
+  VarianceParams(7, 7, &aom_highbd_12_variance128x128_avx2, 12),
+  VarianceParams(7, 6, &aom_highbd_12_variance128x64_avx2, 12),
+  VarianceParams(6, 7, &aom_highbd_12_variance64x128_avx2, 12),
+  VarianceParams(6, 6, &aom_highbd_12_variance64x64_avx2, 12),
+  VarianceParams(6, 5, &aom_highbd_12_variance64x32_avx2, 12),
+  VarianceParams(5, 6, &aom_highbd_12_variance32x64_avx2, 12),
+  VarianceParams(5, 5, &aom_highbd_12_variance32x32_avx2, 12),
+  VarianceParams(5, 4, &aom_highbd_12_variance32x16_avx2, 12),
+  VarianceParams(4, 5, &aom_highbd_12_variance16x32_avx2, 12),
+  VarianceParams(4, 4, &aom_highbd_12_variance16x16_avx2, 12),
+  VarianceParams(4, 3, &aom_highbd_12_variance16x8_avx2, 12),
+  VarianceParams(3, 4, &aom_highbd_12_variance8x16_avx2, 12),
+  VarianceParams(3, 3, &aom_highbd_12_variance8x8_avx2, 12),
+
+  VarianceParams(3, 5, &aom_highbd_12_variance8x32_avx2, 12),
+  VarianceParams(5, 3, &aom_highbd_12_variance32x8_avx2, 12),
+  VarianceParams(4, 6, &aom_highbd_12_variance16x64_avx2, 12),
+  VarianceParams(6, 4, &aom_highbd_12_variance64x16_avx2, 12),
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest,
                          ::testing::ValuesIn(kArrayHBDVariance_avx2));
 
 const SubpelVarianceParams kArrayHBDSubpelVariance_avx2[] = {
+  // SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_avx2,
+  // 12),
+  // SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_avx2,
+  // 12),
+  // SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_avx2,
+  // 12),
+  // SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_avx2,
+  // 12),
+  // SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_avx2,
+  // 12),
+  // SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_avx2,
+  // 12),
+  // SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_avx2,
+  // 12),
+  // SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_avx2,
+  // 12),
+  // SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_avx2,
+  // 12),
+  // SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_avx2,
+  // 12),
+  SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_avx2, 12),
+
   SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_avx2, 10),
   SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_avx2, 10),
   SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_avx2, 10),
@@ -1387,8 +1453,35 @@
   SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_avx2, 10),
   SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_avx2, 10),
   SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_avx2, 10),
-  SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_avx2, 10),
-  SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_avx2, 10),
+
+  SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_avx2, 8),
+  SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_avx2, 8),
+  SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_avx2, 8),
+  SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_avx2, 8),
+  SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_avx2, 8),
+  SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_avx2, 8),
+  SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_avx2, 8),
+  SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_avx2, 8),
+  SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_avx2, 8),
+  SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_avx2, 8),
+  SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_avx2, 8),
+
+  // SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_avx2,
+  // 12),
+  // SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_avx2,
+  // 12),
+  // SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_avx2, 12),
+  SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_avx2, 12),
+
+  SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_avx2, 10),
+  SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_avx2, 10),
+  SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_avx2, 10),
+  SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_avx2, 10),
+
+  SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_avx2, 8),
+  SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_avx2, 8),
+  SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_avx2, 8),
+  SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_avx2, 8),
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDSubpelVarianceTest,