Improve aom_satd SSE2 optimization
At the function level, this CL further sped up the SSE2 function by:
26% (satd_size = 16)
38% (satd_size = 64)
55% (satd_size = 256)
55% (satd_size = 1024)
Change-Id: Ica2559c44a13baf62d113a1688a1bf5a09c05761
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 67ea85b..a52abd0 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -460,17 +460,21 @@
int aom_satd_sse2(const tran_low_t *coeff, int length) {
int i;
const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
__m128i accum = zero;
- for (i = 0; i < length; i += 8) {
- const __m128i src_line = load_tran_low(coeff);
- const __m128i inv = _mm_sub_epi16(zero, src_line);
- const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line)
- const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
- const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
- const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
- accum = _mm_add_epi32(accum, sum);
- coeff += 8;
+ for (i = 0; i < length; i += 16) {
+ const __m128i src_line0 = load_tran_low(coeff);
+ const __m128i src_line1 = load_tran_low(coeff + 8);
+ const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+ const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+ const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // abs(src_line)
+ const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line)
+ const __m128i sum0 = _mm_madd_epi16(abs0, one);
+ const __m128i sum1 = _mm_madd_epi16(abs1, one);
+ accum = _mm_add_epi32(accum, sum0);
+ accum = _mm_add_epi32(accum, sum1);
+ coeff += 16;
}
{ // cascading summation of accum