[arm]: Add aom_highbd_quantize_b{,_32x32,_64x64}_neon().

3.4x to 4.9x faster than the C implementation, depending on the position of
the last nonzero coefficient.

Bug: b/217282899

Change-Id: I4b70e1dec8cb37a245fa68515bd384a0cb6e29cf
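
For context on what the new kernels do: the B quantizer is vectorized over the
32-bit high-bitdepth coefficients, four lanes per iteration. Below is a minimal
sketch of that 4-lane int32 pattern, assuming flat parameters and placeholder
fixed-point constants; it omits the DC/AC parameter split, the dqcoeff output
and eob tracking, and is not the kernel added by this change.

#include <arm_neon.h>
#include <stdint.h>

// Simplified 4-lane int32 quantize step (illustration only; the parameter
// handling and fixed-point constants do not match the libaom kernel).
// n_coeffs is assumed to be a multiple of 4.
static void highbd_quantize_sketch(const int32_t *coeff, intptr_t n_coeffs,
                                   int32_t zbin, int32_t round, int32_t quant,
                                   int32_t *qcoeff) {
  const int32x4_t v_zbin = vdupq_n_s32(zbin);
  const int32x4_t v_round = vdupq_n_s32(round);
  for (intptr_t i = 0; i < n_coeffs; i += 4) {
    const int32x4_t v_coeff = vld1q_s32(&coeff[i]);
    const int32x4_t v_abs = vabsq_s32(v_coeff);
    // Coefficients below the zero-bin threshold quantize to zero.
    const uint32x4_t v_nz_mask = vcgeq_s32(v_abs, v_zbin);
    // (abs + round) * quant, keeping the high half of the product.
    // vqdmulhq_n_s32 computes (2 * a * quant) >> 32 with saturation; it
    // stands in for the exact two-stage multiply the B quantizer uses.
    const int32x4_t v_tmp = vqaddq_s32(v_abs, v_round);
    const int32x4_t v_q = vqdmulhq_n_s32(v_tmp, quant);
    // Reapply the original coefficient sign: (x ^ sign) - sign.
    const int32x4_t v_sign = vshrq_n_s32(v_coeff, 31);
    int32x4_t v_qcoeff = vsubq_s32(veorq_s32(v_q, v_sign), v_sign);
    // Zero out lanes that failed the zbin check, then store.
    v_qcoeff = vandq_s32(v_qcoeff, vreinterpretq_s32_u32(v_nz_mask));
    vst1q_s32(&qcoeff[i], v_qcoeff);
  }
}

With the standard libaom test harness, the C/NEON pairs added to
quantize_func_test.cc below are checked against the C versions, e.g. via
./test_libaom --gtest_filter='*Quantize*'.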
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index fd18ffd..90957d8 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -709,6 +709,7 @@
              static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
   make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_neon,
              static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+
 #if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_neon>,
@@ -719,6 +720,12 @@
   make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
              &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_neon>,
              static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_neon,
+             static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_neon,
+             static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+  make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_neon,
+             static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
 #endif
 };