[x86]: Add aom_quantize_b_avx2().

aom_quantize_b_avx2() is 1.4x to 1.7x faster than aom_quantize_b_avx(),
depending on the position of the last nonzero coefficient.

Bug: b/235228922

Change-Id: I8c1f1c9a9cf77878c34049ae4221cc0c7712fe3d
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index e00cc1f..4408faa 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -567,7 +567,9 @@
   make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
              static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
-             static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8)
+             static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8),
+  make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx2,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, FullPrecisionQuantizeTest,