mse_8xh_16bit_neon: fix src/dst loads

Increment src/dst rather than using `i * stride`. This fixes a bug
(with i counting down from h to 1, `i * stride` addressed rows h..1
instead of rows 0..h-1) introduced when the loop was changed to
decrement i in:
680652b62 Count down in Neon variance loops

Bug: aomedia:3361
Fixed: aomedia:3361

Change-Id: I9d579045c3ec1ce83cbaf23b4526b8f48b9a4dfe
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index e2bc96b..3cc66a0 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -688,11 +688,14 @@
   int i = h;
   do {
     // d7 d6 d5 d4 d3 d2 d1 d0 - 8 bit
-    const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(&dst[i * dstride]));
+    const uint16x8_t dst_16x8 = vmovl_u8(vld1_u8(dst));
     // s7 s6 s5 s4 s3 s2 s1 s0 - 16 bit
-    const uint16x8_t src_16x8 = vld1q_u16(&src[i * sstride]);
+    const uint16x8_t src_16x8 = vld1q_u16(src);
 
     COMPUTE_MSE_16BIT(src_16x8, dst_16x8)
+
+    dst += dstride;
+    src += sstride;
   } while (--i != 0);
   uint64x1_t sum =
       vadd_u64(vget_high_u64(square_result), vget_low_u64(square_result));