Simplify AArch64 SIMD implementation of svt_compute_stats_win5_*
Delete unnecessary multiplications in step 3 of the win5 helper function
and simplify it to only have one case.
This is a port of 234f4afffddf08739dbd04bebb42c05821b9015f in SVT-AV1.
Change-Id: I66d59494e6385c249e2b089573616fcd5f93ce1e
diff --git a/av1/encoder/arm/highbd_pickrst_neon.c b/av1/encoder/arm/highbd_pickrst_neon.c
index bb85a4b..499eb58 100644
--- a/av1/encoder/arm/highbd_pickrst_neon.c
+++ b/av1/encoder/arm/highbd_pickrst_neon.c
@@ -463,64 +463,34 @@
{
const int16_t *d_t = d;
- if (height % 2) {
- int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
- int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
- int16x8_t ds[WIENER_WIN * 2];
+ int32x4_t deltas[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
+ int16x8_t ds[WIENER_WIN_CHROMA + 1];
- load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
- load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
- d_t += 4 * d_stride;
+ ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
+ ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
+ ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
+ ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
- step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
- transpose_arrays_s32_8x8(deltas, deltas_tr);
+ step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
- update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
- deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
- H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
+ transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
+ &deltas[3]);
- update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
- deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
- H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
+ update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
+ deltas[0], vgetq_lane_s32(deltas[4], 0),
+ H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
- update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
- deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
- H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
+ update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
+ deltas[1], vgetq_lane_s32(deltas[4], 1),
+ H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
- update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
- deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
- H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
+ update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
+ deltas[2], vgetq_lane_s32(deltas[4], 2),
+ H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
- } else {
- int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
- int16x8_t ds[WIENER_WIN_CHROMA * 2];
-
- ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
- ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
- ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
- ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
-
- step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
-
- transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
- &deltas[3]);
-
- update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
- deltas[0], vgetq_lane_s32(deltas[4], 0),
- H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
-
- update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
- deltas[1], vgetq_lane_s32(deltas[4], 1),
- H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
-
- update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
- deltas[2], vgetq_lane_s32(deltas[4], 2),
- H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
-
- update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
- deltas[3], vgetq_lane_s32(deltas[4], 3),
- H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
- }
+ update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
+ deltas[3], vgetq_lane_s32(deltas[4], 3),
+ H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
}
// Step 4: Derive the top and left edge of each square. No square in top and
diff --git a/av1/encoder/arm/pickrst_neon.c b/av1/encoder/arm/pickrst_neon.c
index f83e693..70ee3b6 100644
--- a/av1/encoder/arm/pickrst_neon.c
+++ b/av1/encoder/arm/pickrst_neon.c
@@ -1027,64 +1027,34 @@
{
const int16_t *d_t = d;
- if (height % 2) {
- int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
- int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
- int16x8_t ds[WIENER_WIN * 2];
+ int32x4_t deltas[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
+ int16x8_t ds[WIENER_WIN_CHROMA + 1];
- load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
- load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
- d_t += 4 * d_stride;
+ ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
+ ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
+ ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
+ ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
- step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
- transpose_arrays_s32_8x8(deltas, deltas_tr);
+ step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
- update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
- deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
- H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
+ transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
+ &deltas[3]);
- update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
- deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
- H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
+ update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
+ deltas[0], vgetq_lane_s32(deltas[4], 0),
+ H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
- update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
- deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
- H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
+ update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
+ deltas[1], vgetq_lane_s32(deltas[4], 1),
+ H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
- update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
- deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
- H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
+ update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
+ deltas[2], vgetq_lane_s32(deltas[4], 2),
+ H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
- } else {
- int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
- int16x8_t ds[WIENER_WIN_CHROMA * 2];
-
- ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
- ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
- ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
- ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
-
- step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
-
- transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
- &deltas[3]);
-
- update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
- deltas[0], vgetq_lane_s32(deltas[4], 0),
- H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
-
- update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
- deltas[1], vgetq_lane_s32(deltas[4], 1),
- H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
-
- update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
- deltas[2], vgetq_lane_s32(deltas[4], 2),
- H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
-
- update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
- deltas[3], vgetq_lane_s32(deltas[4], 3),
- H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
- }
+ update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
+ deltas[3], vgetq_lane_s32(deltas[4], 3),
+ H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
}
// Step 4: Derive the top and left edge of each square. No square in top and
diff --git a/av1/encoder/arm/pickrst_neon.h b/av1/encoder/arm/pickrst_neon.h
index 356c42d..2eeece8 100644
--- a/av1/encoder/arm/pickrst_neon.h
+++ b/av1/encoder/arm/pickrst_neon.h
@@ -422,9 +422,8 @@
dst[4] = src[4] + delta4;
}
-static inline void compute_delta_step3_two_lines(int32x4_t *sum,
- const int16x8_t src,
- const int16x8_t dgd) {
+static inline void compute_delta_step3_win5(int32x4_t *sum, const int16x8_t src,
+ const int16x8_t dgd) {
*sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
*sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}
@@ -437,16 +436,16 @@
ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
ds[5] = load_unaligned_s16_4x2(d + 1 * d_stride, width);
- compute_delta_step3_two_lines(&deltas[0], ds[0], ds[0]);
- compute_delta_step3_two_lines(&deltas[1], ds[0], ds[1]);
- compute_delta_step3_two_lines(&deltas[2], ds[0], ds[2]);
- compute_delta_step3_two_lines(&deltas[3], ds[0], ds[3]);
- compute_delta_step3_two_lines(&deltas[4], ds[0], ds[4]);
- compute_delta_step3_two_lines(&deltas[0], ds[1], ds[1]);
- compute_delta_step3_two_lines(&deltas[1], ds[1], ds[2]);
- compute_delta_step3_two_lines(&deltas[2], ds[1], ds[3]);
- compute_delta_step3_two_lines(&deltas[3], ds[1], ds[4]);
- compute_delta_step3_two_lines(&deltas[4], ds[1], ds[5]);
+ compute_delta_step3_win5(&deltas[0], ds[0], ds[0]);
+ compute_delta_step3_win5(&deltas[1], ds[0], ds[1]);
+ compute_delta_step3_win5(&deltas[2], ds[0], ds[2]);
+ compute_delta_step3_win5(&deltas[3], ds[0], ds[3]);
+ compute_delta_step3_win5(&deltas[4], ds[0], ds[4]);
+ compute_delta_step3_win5(&deltas[0], ds[1], ds[1]);
+ compute_delta_step3_win5(&deltas[1], ds[1], ds[2]);
+ compute_delta_step3_win5(&deltas[2], ds[1], ds[3]);
+ compute_delta_step3_win5(&deltas[3], ds[1], ds[4]);
+ compute_delta_step3_win5(&deltas[4], ds[1], ds[5]);
ds[0] = ds[2];
ds[1] = ds[3];
@@ -455,36 +454,17 @@
d += 2 * d_stride;
y -= 2;
- } while (y);
-}
+ } while (y > 1);
-static inline void step3_win5_oneline_neon(const int16_t **const d,
- const int32_t d_stride,
- const int32_t width,
- const int32_t height, int16x8_t *ds,
- int32x4_t *deltas) {
- int32_t y = height;
- do {
- ds[8] = vld1q_s16(*d);
- ds[9] = vld1q_s16(*d + width);
+ if (y) {
+ ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
- compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
- compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
- compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
- compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
- compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);
-
- ds[0] = ds[2];
- ds[1] = ds[3];
- ds[2] = ds[4];
- ds[3] = ds[5];
- ds[4] = ds[6];
- ds[5] = ds[7];
- ds[6] = ds[8];
- ds[7] = ds[9];
-
- *d += d_stride;
- } while (--y);
+ compute_delta_step3_win5(&deltas[0], ds[0], ds[0]);
+ compute_delta_step3_win5(&deltas[1], ds[0], ds[1]);
+ compute_delta_step3_win5(&deltas[2], ds[0], ds[2]);
+ compute_delta_step3_win5(&deltas[3], ds[0], ds[3]);
+ compute_delta_step3_win5(&deltas[4], ds[0], ds[4]);
+ }
}
static inline void derive_triangle_win5_neon(const int16x8_t *d_is,
diff --git a/av1/encoder/arm/pickrst_sve.h b/av1/encoder/arm/pickrst_sve.h
index 94c0375..81ff93c 100644
--- a/av1/encoder/arm/pickrst_sve.h
+++ b/av1/encoder/arm/pickrst_sve.h
@@ -527,64 +527,34 @@
{
const int16_t *d_t = d;
- if (height % 2) {
- int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
- int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
- int16x8_t ds[WIENER_WIN * 2];
+ int32x4_t deltas[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
+ int16x8_t ds[WIENER_WIN_CHROMA + 1];
- load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
- load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
- d_t += 4 * d_stride;
+ ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
+ ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
+ ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
+ ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
- step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
- transpose_arrays_s32_8x8(deltas, deltas_tr);
+ step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
- update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
- deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
- H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
+ transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
+ &deltas[3]);
- update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
- deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
- H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
+ update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
+ deltas[0], vgetq_lane_s32(deltas[4], 0),
+ H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
- update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
- deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
- H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
+ update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
+ deltas[1], vgetq_lane_s32(deltas[4], 1),
+ H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
- update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
- deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
- H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
+ update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
+ deltas[2], vgetq_lane_s32(deltas[4], 2),
+ H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
- } else {
- int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
- int16x8_t ds[WIENER_WIN_CHROMA * 2];
-
- ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
- ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
- ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
- ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
-
- step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
-
- transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
- &deltas[3]);
-
- update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
- deltas[0], vgetq_lane_s32(deltas[4], 0),
- H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
-
- update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
- deltas[1], vgetq_lane_s32(deltas[4], 1),
- H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
-
- update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
- deltas[2], vgetq_lane_s32(deltas[4], 2),
- H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
-
- update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
- deltas[3], vgetq_lane_s32(deltas[4], 3),
- H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
- }
+ update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
+ deltas[3], vgetq_lane_s32(deltas[4], 3),
+ H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
}
// Step 4: Derive the top and left edge of each square. No square in top and