Minor optimizations for x86 pro_row/col functions

Replace unnecessary saturating adds with normal adds. Throughput for
saturating arithmetic is lower on some processors (including modern
ones).

Replace the 16-bit extract with _mm_cvtsi128_si32 (movd).

Microbenchmarks were within the margin of error.

Change-Id: I3a6a26339b71779f5156e5af6b94cf29932eb87c
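For context, a minimal standalone sketch of the scalar-extraction tail before and after the change (the reduce_old/reduce_new helpers and the sample data are made up for illustration, not taken from the patch). _mm_sad_epu8 leaves each 8-pixel sum, at most 8 * 255 = 2040, in the low 16 bits of a 64-bit lane and zeroes the rest, so a plain 16-bit add cannot overflow and the movd reads back the same value as the 16-bit extract:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Old tail: saturating add of the two SAD halves, then a 16-bit extract
 * (pextrw) of lane 0. */
static int reduce_old(__m128i s0, int norm_factor) {
  const __m128i s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);
  return _mm_extract_epi16(s0, 0) >> norm_factor;
}

/* New tail: plain 16-bit add, then a movd (_mm_cvtsi128_si32). Bits 16..31
 * of the low dword are zero in a SAD result, so the value is identical. */
static int reduce_new(__m128i s0, int norm_factor) {
  const __m128i s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_add_epi16(s0, s1);
  return _mm_cvtsi128_si32(s0) >> norm_factor;
}

int main(void) {
  uint8_t row[16];
  for (int i = 0; i < 16; ++i) row[i] = (uint8_t)(i * 16 + 7); /* sample data */
  const __m128i zero = _mm_setzero_si128();
  const __m128i src = _mm_loadu_si128((const __m128i *)row);
  const __m128i sad = _mm_sad_epu8(src, zero); /* two 8-pixel sums, <= 2040 */
  printf("old=%d new=%d\n", reduce_old(sad, 3), reduce_new(sad, 3));
  return 0;
}

The same argument covers the wider AVX2 reductions below, where only a handful of such bounded partial sums are combined before the extract.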
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index e4edb12..6f374ee 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -572,15 +572,15 @@
       __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
       __m256i t0 = _mm256_unpacklo_epi8(src_line, zero);
       __m256i t1 = _mm256_unpackhi_epi8(src_line, zero);
-      s0 = _mm256_adds_epu16(s0, t0);
-      s1 = _mm256_adds_epu16(s1, t1);
+      s0 = _mm256_add_epi16(s0, t0);
+      s1 = _mm256_add_epi16(s1, t1);
       ref_tmp += ref_stride;
       src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
       t0 = _mm256_unpacklo_epi8(src_line, zero);
       t1 = _mm256_unpackhi_epi8(src_line, zero);
-      s0 = _mm256_adds_epu16(s0, t0);
-      s1 = _mm256_adds_epu16(s1, t1);
+      s0 = _mm256_add_epi16(s0, t0);
+      s1 = _mm256_add_epi16(s1, t1);
       ref_tmp += ref_stride;
       idx += 2;
     } while (idx < height);
@@ -616,16 +616,16 @@
       const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
       const __m256i s2 = _mm256_sad_epu8(src_line2, zero);
       const __m256i s3 = _mm256_sad_epu8(src_line3, zero);
-      const __m256i result0_256bit = _mm256_adds_epu16(s0, s1);
-      const __m256i result1_256bit = _mm256_adds_epu16(s2, s3);
+      const __m256i result0_256bit = _mm256_add_epi16(s0, s1);
+      const __m256i result1_256bit = _mm256_add_epi16(s2, s3);
       const __m256i result_256bit =
-          _mm256_adds_epu16(result0_256bit, result1_256bit);
+          _mm256_add_epi16(result0_256bit, result1_256bit);
       const __m128i result =
-          _mm_adds_epu16(_mm256_castsi256_si128(result_256bit),
-                         _mm256_extractf128_si256(result_256bit, 1));
-      __m128i result1 = _mm_adds_epu16(result, _mm_srli_si128(result, 8));
-      vbuf[ht] = _mm_extract_epi16(result1, 0) >> norm_factor;
+          _mm_add_epi16(_mm256_castsi256_si128(result_256bit),
+                        _mm256_extractf128_si256(result_256bit, 1));
+      __m128i result1 = _mm_add_epi16(result, _mm_srli_si128(result, 8));
+      vbuf[ht] = _mm_cvtsi128_si32(result1) >> norm_factor;
       ref += ref_stride;
     }
   } else if (width == 64) {
@@ -635,13 +635,13 @@
      const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
       const __m256i s1 = _mm256_sad_epu8(src_line0, zero);
       const __m256i s2 = _mm256_sad_epu8(src_line1, zero);
-      const __m256i result_256bit = _mm256_adds_epu16(s1, s2);
+      const __m256i result_256bit = _mm256_add_epi16(s1, s2);
       const __m128i result =
-          _mm_adds_epu16(_mm256_castsi256_si128(result_256bit),
-                         _mm256_extractf128_si256(result_256bit, 1));
-      __m128i result1 = _mm_adds_epu16(result, _mm_srli_si128(result, 8));
-      vbuf[ht] = _mm_extract_epi16(result1, 0) >> norm_factor;
+          _mm_add_epi16(_mm256_castsi256_si128(result_256bit),
+                        _mm256_extractf128_si256(result_256bit, 1));
+      __m128i result1 = _mm_add_epi16(result, _mm_srli_si128(result, 8));
+      vbuf[ht] = _mm_cvtsi128_si32(result1) >> norm_factor;
       ref += ref_stride;
     }
   } else if (width == 32) {
@@ -654,15 +654,15 @@
       const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
       const __m256i s1 = _mm256_sad_epu8(src_line1, zero);

-      __m128i result0 = _mm_adds_epu16(_mm256_castsi256_si128(s0),
-                                       _mm256_extractf128_si256(s0, 1));
-      __m128i result1 = _mm_adds_epu16(_mm256_castsi256_si128(s1),
-                                       _mm256_extractf128_si256(s1, 1));
-      __m128i result2 = _mm_adds_epu16(result0, _mm_srli_si128(result0, 8));
-      __m128i result3 = _mm_adds_epu16(result1, _mm_srli_si128(result1, 8));
+      __m128i result0 = _mm_add_epi16(_mm256_castsi256_si128(s0),
+                                      _mm256_extractf128_si256(s0, 1));
+      __m128i result1 = _mm_add_epi16(_mm256_castsi256_si128(s1),
+                                      _mm256_extractf128_si256(s1, 1));
+      __m128i result2 = _mm_add_epi16(result0, _mm_srli_si128(result0, 8));
+      __m128i result3 = _mm_add_epi16(result1, _mm_srli_si128(result1, 8));

-      vbuf[ht] = _mm_extract_epi16(result2, 0) >> norm_factor;
-      vbuf[ht + 1] = _mm_extract_epi16(result3, 0) >> norm_factor;
+      vbuf[ht] = _mm_cvtsi128_si32(result2) >> norm_factor;
+      vbuf[ht + 1] = _mm_cvtsi128_si32(result3) >> norm_factor;
       ref += (2 * ref_stride);
     }
   } else if (width == 16) {
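For reference, an illustrative sketch of the 256-bit-to-scalar reduction that the width == 32/64 hunks above switch to (horizontal_sum_32 is a hypothetical helper, not a function from the patch): SAD partials are combined with plain adds, the two 128-bit halves are folded together, and the result is read with _mm_cvtsi128_si32 instead of a 16-bit extract.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Sums 32 bytes to a scalar the same way as the rewritten width == 32 path:
 * per-8-byte SADs, then 16-bit adds (each partial is at most 8 * 255 = 2040,
 * so the combined sums stay far below 65535 and need no saturation), then a
 * movd for the final extract. Requires -mavx2 to build. */
static int horizontal_sum_32(const uint8_t *ref) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i src = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i sad = _mm256_sad_epu8(src, zero);
  const __m128i lo = _mm256_castsi256_si128(sad);
  const __m128i hi = _mm256_extractf128_si256(sad, 1);
  __m128i sum = _mm_add_epi16(lo, hi);
  sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 8));
  return _mm_cvtsi128_si32(sum);
}

int main(void) {
  uint8_t buf[32];
  for (int i = 0; i < 32; ++i) buf[i] = (uint8_t)i; /* 0 + 1 + ... + 31 = 496 */
  printf("sum=%d\n", horizontal_sum_32(buf));
  return 0;
}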
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 8e89555..9657ecc 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -643,15 +643,15 @@
       __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
       __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
       __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
-      s0 = _mm_adds_epu16(s0, t0);
-      s1 = _mm_adds_epu16(s1, t1);
+      s0 = _mm_add_epi16(s0, t0);
+      s1 = _mm_add_epi16(s1, t1);
       ref_tmp += ref_stride;
       src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
       t0 = _mm_unpacklo_epi8(src_line, zero);
       t1 = _mm_unpackhi_epi8(src_line, zero);
-      s0 = _mm_adds_epu16(s0, t0);
-      s1 = _mm_adds_epu16(s1, t1);
+      s0 = _mm_add_epi16(s0, t0);
+      s1 = _mm_add_epi16(s1, t1);
       ref_tmp += ref_stride;
       idx += 2;
     } while (idx < height);
@@ -677,12 +677,12 @@
     for (int i = 0; i < width; i += 16) {
       src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
       s1 = _mm_sad_epu8(src_line, zero);
-      s0 = _mm_adds_epu16(s0, s1);
+      s0 = _mm_add_epi16(s0, s1);
       ref_tmp += 16;
     }
     s1 = _mm_srli_si128(s0, 8);
-    s0 = _mm_adds_epu16(s0, s1);
-    vbuf[ht] = _mm_extract_epi16(s0, 0) >> norm_factor;
+    s0 = _mm_add_epi16(s0, s1);
+    vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor;
   }
 }
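Similarly, an illustrative sketch of the unpack-and-accumulate pattern used in the loops above (accumulate_col_sums is a hypothetical helper, and the height bound is an assumption about the callers, not something the patch states): each 16-bit lane gains at most 255 per row, so for heights up to 256 the running total tops out at 255 * 256 = 65280 and the wrapping 16-bit add gives the same result as the saturating one.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Accumulates per-column sums of a width-16 block into 16-bit lanes the way
 * the rewritten loops do: unpack bytes to 16 bits, then plain adds. Assuming
 * height <= 256, each lane's total is at most 255 * 256 = 65280, so the
 * unsaturated add can never wrap. (Helper name and bound are illustrative.) */
static void accumulate_col_sums(const uint8_t *ref, int stride, int height,
                                uint16_t sums[16]) {
  const __m128i zero = _mm_setzero_si128();
  __m128i s0 = zero, s1 = zero;
  for (int r = 0; r < height; ++r) {
    const __m128i row = _mm_loadu_si128((const __m128i *)(ref + r * stride));
    s0 = _mm_add_epi16(s0, _mm_unpacklo_epi8(row, zero));
    s1 = _mm_add_epi16(s1, _mm_unpackhi_epi8(row, zero));
  }
  _mm_storeu_si128((__m128i *)sums, s0);
  _mm_storeu_si128((__m128i *)(sums + 8), s1);
}

int main(void) {
  uint8_t block[16 * 16];
  for (int i = 0; i < 16 * 16; ++i) block[i] = (uint8_t)i;
  uint16_t sums[16];
  accumulate_col_sums(block, 16, 16, sums);
  printf("col0=%d col15=%d\n", sums[0], sums[15]); /* 1920 and 2160 */
  return 0;
}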