mse_8xh_16bit_neon: fix src/dst loads. Increment src/dst rather than using `i * stride`. This fixes a bug introduced when the loop was changed to decrement i in: 680652b62 Count down in Neon variance loops. Bug: aomedia:3361 Fixed aomedia:3361 Change-Id: I9d579045c3ec1ce83cbaf23b4526b8f48b9a4dfe

commit: af89b290e3234b10e0c73b2b18c4d11b46c25547 [log] [tgz]
author: James Zern <jzern@google.com> Fri Dec 16 11:21:54 2022 -0800
committer: James Zern <jzern@google.com> Fri Dec 16 11:21:54 2022 -0800
tree: c12a95c938c1d959a03fdeb330094e0a46a4fde6
parent: 7468af9096415f6dbfa073054ba0900a648cf948 [diff]
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index e2bc96b..3cc66a0 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c

@@ -688,11 +688,14 @@
   int i = h;
   do {
     // d7 d6 d5 d4 d3 d2 d1 d0 - 8 bit
-    const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(&dst[i * dstride]));
+    const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(dst));
     // s7 s6 s5 s4 s3 s2 s1 s0 - 16 bit
-    const uint16x8_t src_16x8 = vld1q_u16(&src[i * sstride]);
+    const uint16x8_t src_16x8 = vld1q_u16(src);
 
     COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
+
+    dst += dstride;
+    src += sstride;
   } while (--i != 0);
   uint64x1_t sum =
       vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));
commit	af89b290e3234b10e0c73b2b18c4d11b46c25547	[log] [tgz]
author	James Zern <jzern@google.com>	Fri Dec 16 11:21:54 2022 -0800
committer	James Zern <jzern@google.com>	Fri Dec 16 11:21:54 2022 -0800
tree	c12a95c938c1d959a03fdeb330094e0a46a4fde6
parent	7468af9096415f6dbfa073054ba0900a648cf948 [diff]