Use load_unaligned mem_neon.h helpers in SAD and SAD4D
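Use the load_unaligned_u8 helper function from mem_neon.h to load strided
sequences of 4 bytes where alignment is not guaranteed in the Neon
SAD and SAD4D paths. This replaces the open-coded memcpy +
vdup_n_u32/vset_lane_u32 sequences and the vreinterpret_u8_u32 casts
that followed them.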
Change-Id: I7332e412f8b93879b64d2ee9d944ff5962fb5745
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index e1eccc3..dc9ac20 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -15,6 +15,7 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
@@ -416,35 +417,16 @@
int i = 0;
do {
- uint32x2_t s, r0, r1, r2, r3;
- uint32_t s_lo, s_hi, r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi, r3_lo, r3_hi;
+ uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
- memcpy(&s_lo, src + i * src_stride, 4);
- memcpy(&r0_lo, ref[0] + i * ref_stride, 4);
- memcpy(&r1_lo, ref[1] + i * ref_stride, 4);
- memcpy(&r2_lo, ref[2] + i * ref_stride, 4);
- memcpy(&r3_lo, ref[3] + i * ref_stride, 4);
- s = vdup_n_u32(s_lo);
- r0 = vdup_n_u32(r0_lo);
- r1 = vdup_n_u32(r1_lo);
- r2 = vdup_n_u32(r2_lo);
- r3 = vdup_n_u32(r3_lo);
-
- memcpy(&s_hi, src + (i + 1) * src_stride, 4);
- memcpy(&r0_hi, ref[0] + (i + 1) * ref_stride, 4);
- memcpy(&r1_hi, ref[1] + (i + 1) * ref_stride, 4);
- memcpy(&r2_hi, ref[2] + (i + 1) * ref_stride, 4);
- memcpy(&r3_hi, ref[3] + (i + 1) * ref_stride, 4);
- s = vset_lane_u32(s_hi, s, 1);
- r0 = vset_lane_u32(r0_hi, r0, 1);
- r1 = vset_lane_u32(r1_hi, r1, 1);
- r2 = vset_lane_u32(r2_hi, r2, 1);
- r3 = vset_lane_u32(r3_hi, r3, 1);
-
- sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r0), &sum[0]);
- sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r1), &sum[1]);
- sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r2), &sum[2]);
- sad8_neon(vreinterpret_u8_u32(s), vreinterpret_u8_u32(r3), &sum[3]);
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+ sad8_neon(s, r3, &sum[3]);
i += 2;
} while (i < h);
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 5ba7f10..6a22289 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -10,9 +10,12 @@
*/
#include <arm_neon.h>
+
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
+
#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#if defined(__ARM_FEATURE_DOTPROD)
@@ -289,24 +292,13 @@
int i = h / 2;
do {
- uint32x2_t s, r;
- uint32_t s0, s1, r0, r1;
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
- memcpy(&s0, src_ptr, 4);
- memcpy(&r0, ref_ptr, 4);
- s = vdup_n_u32(s0);
- r = vdup_n_u32(r0);
- src_ptr += src_stride;
- ref_ptr += ref_stride;
+ sum = vabal_u8(sum, s, r);
- memcpy(&s1, src_ptr, 4);
- memcpy(&r1, ref_ptr, 4);
- s = vset_lane_u32(s1, s, 1);
- r = vset_lane_u32(r1, r, 1);
- src_ptr += src_stride;
- ref_ptr += ref_stride;
-
- sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
} while (--i != 0);
return horizontal_add_u16x8(sum);
@@ -732,28 +724,15 @@
int i = h / 2;
do {
- uint32x2_t s, r;
- uint32_t s0, s1, r0, r1;
- uint8x8_t p, avg;
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
- memcpy(&s0, src_ptr, 4);
- memcpy(&r0, ref_ptr, 4);
- s = vdup_n_u32(s0);
- r = vdup_n_u32(r0);
- src_ptr += src_stride;
- ref_ptr += ref_stride;
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
- memcpy(&s1, src_ptr, 4);
- memcpy(&r1, ref_ptr, 4);
- s = vset_lane_u32(s1, s, 1);
- r = vset_lane_u32(r1, r, 1);
- src_ptr += src_stride;
- ref_ptr += ref_stride;
-
- p = vld1_u8(second_pred);
- avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
-
- sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
second_pred += 8;
} while (--i != 0);