AV1 RT: Fix NEON bug in av1_quantize_lp

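Fix the NEON implementation of av1_quantize_lp and re-enable it. The
NEON code loaded its v_iscan vectors from scan and had no iscan
parameter; it now takes iscan and loads from it. Also add a unit test
that checks the NEON version against the C version.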

BUG=aomedia:3162

Change-Id: I8bc62c009bfb8e264a75e7bb67c475d5121bf079
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7f2cd7e..a4e1bbf 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -330,9 +330,8 @@
   add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp sse2 avx2 neon/;
 
-  # TODO(any): need to fix the bug in neon optimization and re-enable it.
   add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_lp sse2 avx2/;
+  specialize qw/av1_quantize_lp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp_32x32 neon avx2/;
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/neon/quantize_neon.c
index b0e8950..7e028f5 100644
--- a/av1/encoder/arm/neon/quantize_neon.c
+++ b/av1/encoder/arm/neon/quantize_neon.c
@@ -133,7 +133,8 @@
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan) {
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
   const int16x8_t v_zero = vdupq_n_s16(0);
@@ -149,7 +150,7 @@
   v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
   // process dc and the first seven ac coeffs
   {
-    const int16x8_t v_iscan = vld1q_s16(&scan[0]);
+    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
     const int16x8_t v_coeff = vld1q_s16(coeff_ptr);
     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
     const int16x8_t v_abs = vabsq_s16(v_coeff);
@@ -174,7 +175,7 @@
   }
   // now process the rest of the ac coeffs
   for (int i = 8; i < count; i += 8) {
-    const int16x8_t v_iscan = vld1q_s16(&scan[i]);
+    const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
     const int16x8_t v_coeff = vld1q_s16(coeff_ptr + i);
     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
     const int16x8_t v_abs = vabsq_s16(v_coeff);
diff --git a/test/quantize_lp_func_test.cc b/test/quantize_lp_func_test.cc
index 898b810..f398623 100644
--- a/test/quantize_lp_func_test.cc
+++ b/test/quantize_lp_func_test.cc
@@ -348,4 +348,18 @@
                          ::testing::ValuesIn(kQParamArraySSE2));
 #endif
 
+#if HAVE_NEON
+const QuantizeParam kQParamArrayNEON[] = {
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_neon,
+             static_cast<TX_SIZE>(TX_16X16), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_neon,
+             static_cast<TX_SIZE>(TX_8X8), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_neon,
+             static_cast<TX_SIZE>(TX_4X4), AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, FullPrecisionQuantizeLpTest,
+                         ::testing::ValuesIn(kQParamArrayNEON));
+#endif
+
 }  // namespace