Simplify AArch64 SIMD implementation of svt_compute_stats_win5_*

Delete unnecessary multiplications in step 3 of the win5 helper function
and simplify it down to a single code path: the rewritten step3_win5_neon()
now handles an odd trailing row itself, so the separate
step3_win5_oneline_neon() helper and the height % 2 branch in the callers
can be removed.
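
The single code path is made possible by the loop shape of the rewritten
step3_win5_neon(): the main loop consumes two rows per iteration and exits
while at most one row remains, and a small tail then applies the same
per-row delta update to an odd final row. A minimal scalar sketch of that
control flow follows (assuming height >= 2; step3_sketch() and row_delta()
are illustrative stand-ins, not the real NEON helpers):

    #include <stdint.h>

    /* Stand-in for compute_delta_step3_win5(): subtract the column leaving
     * the window, add the column entering it (the two halves loaded by
     * load_unaligned_s16_4x2() sit at offsets 0 and width). */
    static void row_delta(int32_t *sum, const int16_t *src,
                          const int16_t *dgd, int32_t width) {
      *sum -= (int32_t)src[0] * dgd[0];
      *sum += (int32_t)src[width] * dgd[width];
    }

    static void step3_sketch(const int16_t *d, int32_t d_stride,
                             int32_t width, int32_t height, int32_t *delta) {
      int32_t y = height;
      do {  /* Main loop: two rows per iteration. */
        row_delta(delta, d + 0 * d_stride, d + 0 * d_stride, width);
        row_delta(delta, d + 1 * d_stride, d + 1 * d_stride, width);
        d += 2 * d_stride;
        y -= 2;
      } while (y > 1);

      if (y) {  /* Odd height: one trailing single-row update. */
        row_delta(delta, d, d, width);
      }
    }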

This is a port of 234f4afffddf08739dbd04bebb42c05821b9015f in SVT-AV1.

Change-Id: I66d59494e6385c249e2b089573616fcd5f93ce1e
diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c
index bb85a4b..499eb58 100644
--- a/av1/encoder/arm/highbd_pickrst_neon.c
+++ b/av1/encoder/arm/highbd_pickrst_neon.c
@@ -463,64 +463,34 @@
   {
     const int16_t *d_t = d;
 
-    if (height % 2) {
-      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
-      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
-      int16x8_t ds[WIENER_WIN * 2];
+    int32x4_t deltas[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
+    int16x8_t ds[WIENER_WIN_CHROMA + 1];
 
-      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
-      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
-      d_t += 4 * d_stride;
+    ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
+    ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
+    ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
+    ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
 
-      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
-      transpose_arrays_s32_8x8(deltas, deltas_tr);
+    step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
 
-      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
-                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
-                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
+    transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
+                                    &deltas[3]);
 
-      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
-                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
-                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
+    update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
+                        deltas[0], vgetq_lane_s32(deltas[4], 0),
+                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
 
-      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
-                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
-                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
+    update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
+                        deltas[1], vgetq_lane_s32(deltas[4], 1),
+                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
 
-      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
-                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
-                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
+    update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
+                        deltas[2], vgetq_lane_s32(deltas[4], 2),
+                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
 
-    } else {
-      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
-      int16x8_t ds[WIENER_WIN_CHROMA * 2];
-
-      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
-      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
-      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
-      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
-
-      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
-
-      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
-                                      &deltas[3]);
-
-      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
-                          deltas[0], vgetq_lane_s32(deltas[4], 0),
-                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
-
-      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
-                          deltas[1], vgetq_lane_s32(deltas[4], 1),
-                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
-
-      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
-                          deltas[2], vgetq_lane_s32(deltas[4], 2),
-                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
-
-      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
-                          deltas[3], vgetq_lane_s32(deltas[4], 3),
-                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
-    }
+    update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
+                        deltas[3], vgetq_lane_s32(deltas[4], 3),
+                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   }
 
   // Step 4: Derive the top and left edge of each square. No square in top and
diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c
index f83e693..70ee3b6 100644
--- a/av1/encoder/arm/pickrst_neon.c
+++ b/av1/encoder/arm/pickrst_neon.c
@@ -1027,64 +1027,34 @@
   {
     const int16_t *d_t = d;
 
-    if (height % 2) {
-      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
-      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
-      int16x8_t ds[WIENER_WIN * 2];
+    int32x4_t deltas[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
+    int16x8_t ds[WIENER_WIN_CHROMA + 1];
 
-      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
-      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
-      d_t += 4 * d_stride;
+    ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
+    ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
+    ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
+    ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
 
-      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
-      transpose_arrays_s32_8x8(deltas, deltas_tr);
+    step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
 
-      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
-                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
-                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
+    transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
+                                    &deltas[3]);
 
-      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
-                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
-                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
+    update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
+                        deltas[0], vgetq_lane_s32(deltas[4], 0),
+                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
 
-      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
-                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
-                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
+    update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
+                        deltas[1], vgetq_lane_s32(deltas[4], 1),
+                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
 
-      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
-                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
-                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
+    update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
+                        deltas[2], vgetq_lane_s32(deltas[4], 2),
+                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
 
-    } else {
-      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
-      int16x8_t ds[WIENER_WIN_CHROMA * 2];
-
-      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
-      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
-      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
-      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
-
-      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
-
-      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
-                                      &deltas[3]);
-
-      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
-                          deltas[0], vgetq_lane_s32(deltas[4], 0),
-                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
-
-      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
-                          deltas[1], vgetq_lane_s32(deltas[4], 1),
-                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
-
-      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
-                          deltas[2], vgetq_lane_s32(deltas[4], 2),
-                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
-
-      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
-                          deltas[3], vgetq_lane_s32(deltas[4], 3),
-                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
-    }
+    update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
+                        deltas[3], vgetq_lane_s32(deltas[4], 3),
+                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   }
 
   // Step 4: Derive the top and left edge of each square. No square in top and
diff --git a/av1/encoder/arm/pickrst_neon.h b/av1/encoder/arm/pickrst_neon.h
index 356c42d..2eeece8 100644
--- a/av1/encoder/arm/pickrst_neon.h
+++ b/av1/encoder/arm/pickrst_neon.h
@@ -422,9 +422,8 @@
   dst[4] = src[4] + delta4;
 }
 
-static inline void compute_delta_step3_two_lines(int32x4_t *sum,
-                                                 const int16x8_t src,
-                                                 const int16x8_t dgd) {
+static inline void compute_delta_step3_win5(int32x4_t *sum, const int16x8_t src,
+                                            const int16x8_t dgd) {
   *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
   *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
 }
@@ -437,16 +436,16 @@
     ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
     ds[5] = load_unaligned_s16_4x2(d + 1 * d_stride, width);
 
-    compute_delta_step3_two_lines(&deltas[0], ds[0], ds[0]);
-    compute_delta_step3_two_lines(&deltas[1], ds[0], ds[1]);
-    compute_delta_step3_two_lines(&deltas[2], ds[0], ds[2]);
-    compute_delta_step3_two_lines(&deltas[3], ds[0], ds[3]);
-    compute_delta_step3_two_lines(&deltas[4], ds[0], ds[4]);
-    compute_delta_step3_two_lines(&deltas[0], ds[1], ds[1]);
-    compute_delta_step3_two_lines(&deltas[1], ds[1], ds[2]);
-    compute_delta_step3_two_lines(&deltas[2], ds[1], ds[3]);
-    compute_delta_step3_two_lines(&deltas[3], ds[1], ds[4]);
-    compute_delta_step3_two_lines(&deltas[4], ds[1], ds[5]);
+    compute_delta_step3_win5(&deltas[0], ds[0], ds[0]);
+    compute_delta_step3_win5(&deltas[1], ds[0], ds[1]);
+    compute_delta_step3_win5(&deltas[2], ds[0], ds[2]);
+    compute_delta_step3_win5(&deltas[3], ds[0], ds[3]);
+    compute_delta_step3_win5(&deltas[4], ds[0], ds[4]);
+    compute_delta_step3_win5(&deltas[0], ds[1], ds[1]);
+    compute_delta_step3_win5(&deltas[1], ds[1], ds[2]);
+    compute_delta_step3_win5(&deltas[2], ds[1], ds[3]);
+    compute_delta_step3_win5(&deltas[3], ds[1], ds[4]);
+    compute_delta_step3_win5(&deltas[4], ds[1], ds[5]);
 
     ds[0] = ds[2];
     ds[1] = ds[3];
@@ -455,36 +454,17 @@
 
     d += 2 * d_stride;
     y -= 2;
-  } while (y);
-}
+  } while (y > 1);
 
-static inline void step3_win5_oneline_neon(const int16_t **const d,
-                                           const int32_t d_stride,
-                                           const int32_t width,
-                                           const int32_t height, int16x8_t *ds,
-                                           int32x4_t *deltas) {
-  int32_t y = height;
-  do {
-    ds[8] = vld1q_s16(*d);
-    ds[9] = vld1q_s16(*d + width);
+  if (y) {
+    ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
 
-    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
-    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
-    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
-    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
-    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);
-
-    ds[0] = ds[2];
-    ds[1] = ds[3];
-    ds[2] = ds[4];
-    ds[3] = ds[5];
-    ds[4] = ds[6];
-    ds[5] = ds[7];
-    ds[6] = ds[8];
-    ds[7] = ds[9];
-
-    *d += d_stride;
-  } while (--y);
+    compute_delta_step3_win5(&deltas[0], ds[0], ds[0]);
+    compute_delta_step3_win5(&deltas[1], ds[0], ds[1]);
+    compute_delta_step3_win5(&deltas[2], ds[0], ds[2]);
+    compute_delta_step3_win5(&deltas[3], ds[0], ds[3]);
+    compute_delta_step3_win5(&deltas[4], ds[0], ds[4]);
+  }
 }
 
 static inline void derive_triangle_win5_neon(const int16x8_t *d_is,
diff --git a/av1/encoder/arm/pickrst_sve.h b/av1/encoder/arm/pickrst_sve.h
index 94c0375..81ff93c 100644
--- a/av1/encoder/arm/pickrst_sve.h
+++ b/av1/encoder/arm/pickrst_sve.h
@@ -527,64 +527,34 @@
   {
     const int16_t *d_t = d;
 
-    if (height % 2) {
-      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
-      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
-      int16x8_t ds[WIENER_WIN * 2];
+    int32x4_t deltas[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
+    int16x8_t ds[WIENER_WIN_CHROMA + 1];
 
-      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
-      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
-      d_t += 4 * d_stride;
+    ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
+    ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
+    ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
+    ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
 
-      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
-      transpose_arrays_s32_8x8(deltas, deltas_tr);
+    step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
 
-      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
-                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
-                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
+    transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
+                                    &deltas[3]);
 
-      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
-                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
-                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
+    update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
+                        deltas[0], vgetq_lane_s32(deltas[4], 0),
+                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
 
-      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
-                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
-                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
+    update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
+                        deltas[1], vgetq_lane_s32(deltas[4], 1),
+                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
 
-      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
-                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
-                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
+    update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
+                        deltas[2], vgetq_lane_s32(deltas[4], 2),
+                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
 
-    } else {
-      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
-      int16x8_t ds[WIENER_WIN_CHROMA * 2];
-
-      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
-      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
-      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
-      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
-
-      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
-
-      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
-                                      &deltas[3]);
-
-      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
-                          deltas[0], vgetq_lane_s32(deltas[4], 0),
-                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
-
-      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
-                          deltas[1], vgetq_lane_s32(deltas[4], 1),
-                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
-
-      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
-                          deltas[2], vgetq_lane_s32(deltas[4], 2),
-                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
-
-      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
-                          deltas[3], vgetq_lane_s32(deltas[4], 3),
-                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
-    }
+    update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
+                        deltas[3], vgetq_lane_s32(deltas[4], 3),
+                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
   }
 
   // Step 4: Derive the top and left edge of each square. No square in top and