Optimize quantize_fp_neon function for LBD and HBD

Optimize the EOB calculation by removing the iscan + 1 adjustment from
get_max_lane_eob(). Move this logic to get_max_eob(), which now adds 1
to the maximum value, and drop the + 1 from its existing callers.

Use -1 instead of 0 as the value for masked-out lanes when taking the
maximum against iscan values, to distinguish the case where a non-zero
element is at position 0 from the case where no non-zero value exists.

This is a port from SVT-AV1:
gitlab.com/AOMediaCodec/SVT-AV1/-/commit/f86adc26b60e5796f560ac1971962267b3a63ed4

Change-Id: I500eb61d508284b439f1efceac42595bff5fb520
diff --git a/av1/encoder/arm/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c
index c1016db..fd4749d 100644
--- a/av1/encoder/arm/av1_highbd_quantize_neon.c
+++ b/av1/encoder/arm/av1_highbd_quantize_neon.c
@@ -60,15 +60,15 @@
 static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
                                          int16x8_t v_eobmax,
                                          uint16x8_t v_mask) {
-  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
-  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  const int16x8_t v_iscan = vld1q_s16(iscan);
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
   return vmaxq_s16(v_eobmax, v_nz_iscan);
 }
 
 static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
 #if AOM_ARCH_AARCH64
-  return (uint16_t)vmaxvq_s16(v_eobmax);
+  int16_t max_val = vmaxvq_s16(v_eobmax);
+  return (uint16_t)max_val + 1;
 #else
   const int16x4_t v_eobmax_3210 =
       vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
@@ -80,7 +80,7 @@
       vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
   const int16x4_t v_eobmax_final =
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
 #endif
 }
 
diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c
index cbeafc7..8d13e57 100644
--- a/av1/encoder/arm/quantize_neon.c
+++ b/av1/encoder/arm/quantize_neon.c
@@ -29,7 +29,8 @@
 
 static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
 #if AOM_ARCH_AARCH64
-  return (uint16_t)vmaxvq_s16(v_eobmax);
+  int16_t max_val = vmaxvq_s16(v_eobmax);
+  return (uint16_t)max_val + 1;
 #else
   const int16x4_t v_eobmax_3210 =
       vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
@@ -41,16 +42,15 @@
       vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
   const int16x4_t v_eobmax_final =
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
 #endif
 }
 
 static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
                                          int16x8_t v_eobmax,
                                          uint16x8_t v_mask) {
-  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
-  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  const int16x8_t v_iscan = vld1q_s16(iscan);
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
   return vmaxq_s16(v_eobmax, v_nz_iscan);
 }
 
@@ -445,7 +445,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 #define QM_MULL_SHIFT(x0, x1)                                              \
@@ -580,7 +580,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 static void aom_quantize_b_helper_32x32_neon(
@@ -718,7 +718,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 static void aom_quantize_b_helper_64x64_neon(
@@ -867,7 +867,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 void aom_quantize_b_helper_neon(