rtc: Add AVX2 variant for functions related to motion search
This CL Adds AVX2 for aom_int_pro_row() and aom_int_pro_col()
functions. Also, refactored the existing code to make it AVX2
friendly.
The overall encode time reduction for RT preset is listed below
Encode_time
cpu Reduction(%)
7 1.150
8 1.331
Change-Id: Idebe6dd72933674148bcc41785f9b42d93dc2f11
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d672d77..94f15c4 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1188,11 +1188,11 @@
add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
}
- add_proto qw/void aom_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
- specialize qw/aom_int_pro_row sse2 neon/;
+ add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize qw/aom_int_pro_row avx2 sse2 neon/;
- add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
- specialize qw/aom_int_pro_col sse2 neon/;
+ add_proto qw/void aom_int_pro_col/, "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize qw/aom_int_pro_col avx2 sse2 neon/;
add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
specialize qw/aom_vector_var sse4_1 neon/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 593807b..2959c97 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -89,58 +89,55 @@
return horizontal_add_s32x4(accum);
}
-void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
- const int ref_stride, const int height) {
- int i;
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
const uint8_t *idx = ref;
- uint16x8_t vec0 = vdupq_n_u16(0);
- uint16x8_t vec1 = vec0;
- uint8x16_t tmp;
+ const uint16x8_t zero = vdupq_n_u16(0);
+ const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
- for (i = 0; i < height; ++i) {
- tmp = vld1q_u8(idx);
- idx += ref_stride;
- vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
- vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+ for (int wd = 0; wd < width; wd += 16) {
+ uint16x8_t vec0 = zero;
+ uint16x8_t vec1 = zero;
+ idx = ref + wd;
+ for (int ht = 0; ht < height; ++ht) {
+ const uint8x16_t tmp = vld1q_u8(idx);
+ idx += ref_stride;
+ vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+ vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+ }
+
+ const int16x8_t result0 =
+ vshlq_s16(vreinterpretq_s16_u16(vec0), neg_norm_factor);
+ const int16x8_t result1 =
+ vshlq_s16(vreinterpretq_s16_u16(vec1), neg_norm_factor);
+
+ vst1q_s16(hbuf + wd, result0);
+ vst1q_s16(hbuf + wd + 8, result1);
}
-
- if (128 == height) {
- vec0 = vshrq_n_u16(vec0, 6);
- vec1 = vshrq_n_u16(vec1, 6);
- } else if (64 == height) {
- vec0 = vshrq_n_u16(vec0, 5);
- vec1 = vshrq_n_u16(vec1, 5);
- } else if (32 == height) {
- vec0 = vshrq_n_u16(vec0, 4);
- vec1 = vshrq_n_u16(vec1, 4);
- } else if (16 == height) {
- vec0 = vshrq_n_u16(vec0, 3);
- vec1 = vshrq_n_u16(vec1, 3);
- }
-
- vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
- hbuf += 8;
- vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
}
-int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
- const uint8_t *idx;
- uint16x8_t sum = vdupq_n_u16(0);
-
- for (idx = ref; idx < (ref + width); idx += 16) {
- uint8x16_t vec = vld1q_u8(idx);
- sum = vaddq_u16(sum, vpaddlq_u8(vec));
- }
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ for (int ht = 0; ht < height; ++ht) {
+ uint16x8_t sum = vdupq_n_u16(0);
+ for (int wd = 0; wd < width; wd += 16) {
+ const uint8x16_t vec = vld1q_u8(ref + wd);
+ sum = vaddq_u16(sum, vpaddlq_u8(vec));
+ }
#if defined(__aarch64__)
- return (int16_t)vaddvq_u16(sum);
+ vbuf[ht] = ((int16_t)vaddvq_u16(sum)) >> norm_factor;
#else
- const uint32x4_t a = vpaddlq_u16(sum);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return (int16_t)vget_lane_u32(c, 0);
+ const uint32x4_t a = vpaddlq_u16(sum);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ vbuf[ht] = ((int16_t)vget_lane_u32(c, 0)) >> norm_factor;
#endif
+ ref += ref_stride;
+ }
}
// coeff: 16 bits, dynamic range [-32640, 32640].
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index a3821e6..920f08d 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -507,29 +507,29 @@
// Integer projection onto row vectors.
// height: value range {16, 32, 64, 128}.
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
- const int ref_stride, const int height) {
- int idx;
- const int norm_factor = height >> 1;
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
+ const int width, const int height, int norm_factor) {
assert(height >= 2);
- for (idx = 0; idx < 16; ++idx) {
- int i;
+ for (int idx = 0; idx < width; ++idx) {
hbuf[idx] = 0;
// hbuf[idx]: 14 bit, dynamic range [0, 32640].
- for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+ for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
// hbuf[idx]: 9 bit, dynamic range [0, 1020].
- hbuf[idx] /= norm_factor;
+ hbuf[idx] >>= norm_factor;
++ref;
}
}
// width: value range {16, 32, 64, 128}.
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
- int idx;
- int16_t sum = 0;
- // sum: 14 bit, dynamic range [0, 32640]
- for (idx = 0; idx < width; ++idx) sum += ref[idx];
- return sum;
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
+ const int width, const int height, int norm_factor) {
+ for (int ht = 0; ht < height; ++ht) {
+ int16_t sum = 0;
+ // sum: 14 bit, dynamic range [0, 32640]
+ for (int idx = 0; idx < width; ++idx) sum += ref[idx];
+ vbuf[ht] = sum >> norm_factor;
+ ref += ref_stride;
+ }
}
// ref: [0 - 510]
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6c8db3a..9bfc06b 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -552,3 +552,120 @@
avg[2] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 4);
avg[3] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 4);
}
+
+void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // SIMD implementation assumes width and height to be multiple of 16 and 2
+ // respectively. For any odd width or height, SIMD support needs to be added.
+ assert(width % 16 == 0 && height % 2 == 0);
+
+ if (width % 32 == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int wd = 0; wd < width; wd += 32) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m256i s0 = zero;
+ __m256i s1 = zero;
+ int idx = 0;
+ do {
+ __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ __m256i t0 = _mm256_unpacklo_epi8(src_line, zero);
+ __m256i t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_adds_epu16(s0, t0);
+ s1 = _mm256_adds_epu16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ t0 = _mm256_unpacklo_epi8(src_line, zero);
+ t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_adds_epu16(s0, t0);
+ s1 = _mm256_adds_epu16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+ s0 = _mm256_srai_epi16(s0, norm_factor);
+ s1 = _mm256_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 16),
+ _mm256_extractf128_si256(s0, 1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 24),
+ _mm256_extractf128_si256(s1, 1));
+ }
+ } else if (width % 16 == 0) {
+ aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor);
+ }
+}
+
+void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // SIMD implementation assumes width to be multiple of 16. For any odd width,
+ // SIMD support needs to be added.
+ assert(width % 16 == 0);
+
+ if (width == 128) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ++ht) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(ref + 64));
+ const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(ref + 96));
+ const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
+ const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
+ const __m256i s2 = _mm256_sad_epu8(src_line2, zero);
+ const __m256i s3 = _mm256_sad_epu8(src_line3, zero);
+ const __m256i result0_256bit = _mm256_adds_epu16(s0, s1);
+ const __m256i result1_256bit = _mm256_adds_epu16(s2, s3);
+ const __m256i result_256bit =
+ _mm256_adds_epu16(result0_256bit, result1_256bit);
+
+ const __m128i result =
+ _mm_adds_epu16(_mm256_castsi256_si128(result_256bit),
+ _mm256_extractf128_si256(result_256bit, 1));
+ __m128i result1 = _mm_adds_epu16(result, _mm_srli_si128(result, 8));
+ vbuf[ht] = _mm_extract_epi16(result1, 0) >> norm_factor;
+ ref += ref_stride;
+ }
+ } else if (width == 64) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ++ht) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i s1 = _mm256_sad_epu8(src_line0, zero);
+ const __m256i s2 = _mm256_sad_epu8(src_line1, zero);
+ const __m256i result_256bit = _mm256_adds_epu16(s1, s2);
+
+ const __m128i result =
+ _mm_adds_epu16(_mm256_castsi256_si128(result_256bit),
+ _mm256_extractf128_si256(result_256bit, 1));
+ __m128i result1 = _mm_adds_epu16(result, _mm_srli_si128(result, 8));
+ vbuf[ht] = _mm_extract_epi16(result1, 0) >> norm_factor;
+ ref += ref_stride;
+ }
+ } else if (width == 32) {
+ assert(height % 2 == 0);
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 2) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 =
+ _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
+ const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
+
+ __m128i result0 = _mm_adds_epu16(_mm256_castsi256_si128(s0),
+ _mm256_extractf128_si256(s0, 1));
+ __m128i result1 = _mm_adds_epu16(_mm256_castsi256_si128(s1),
+ _mm256_extractf128_si256(s1, 1));
+ __m128i result2 = _mm_adds_epu16(result0, _mm_srli_si128(result0, 8));
+ __m128i result3 = _mm_adds_epu16(result1, _mm_srli_si128(result1, 8));
+
+ vbuf[ht] = _mm_extract_epi16(result2, 0) >> norm_factor;
+ vbuf[ht + 1] = _mm_extract_epi16(result3, 0) >> norm_factor;
+ ref += (2 * ref_stride);
+ }
+ } else if (width == 16) {
+ aom_int_pro_col_sse2(vbuf, ref, ref_stride, width, height, norm_factor);
+ }
+}
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index bdbd1f6..8e89555 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -625,74 +625,64 @@
return _mm_cvtsi128_si32(accum);
}
-void aom_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
- const int ref_stride, const int height) {
- int idx = 1;
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // SIMD implementation assumes width and height to be multiple of 16 and 2
+ // respectively. For any odd width or height, SIMD support needs to be added.
+ assert(width % 16 == 0 && height % 2 == 0);
__m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
- __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
- __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
- __m128i t0, t1;
- int height_1 = height - 1;
- ref += ref_stride;
- do {
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- ref += ref_stride;
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- ref += ref_stride;
- idx += 2;
- } while (idx < height_1);
+ for (int wd = 0; wd < width; wd += 16) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m128i s0 = zero;
+ __m128i s1 = zero;
+ int idx = 0;
+ do {
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref_tmp += ref_stride;
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- if (height == 128) {
- s0 = _mm_srai_epi16(s0, 6);
- s1 = _mm_srai_epi16(s1, 6);
- } else if (height == 64) {
- s0 = _mm_srai_epi16(s0, 5);
- s1 = _mm_srai_epi16(s1, 5);
- } else if (height == 32) {
- s0 = _mm_srai_epi16(s0, 4);
- s1 = _mm_srai_epi16(s1, 4);
- } else {
- assert(height == 16);
- s0 = _mm_srai_epi16(s0, 3);
- s1 = _mm_srai_epi16(s1, 3);
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+
+ s0 = _mm_srai_epi16(s0, norm_factor);
+ s1 = _mm_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
}
-
- _mm_storeu_si128((__m128i *)hbuf, s0);
- hbuf += 8;
- _mm_storeu_si128((__m128i *)hbuf, s1);
}
-int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
- __m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
- __m128i s0 = _mm_sad_epu8(src_line, zero);
- __m128i s1;
- int i;
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // SIMD implementation assumes width to be multiple of 16.
+ assert(width % 16 == 0);
- for (i = 16; i < width; i += 16) {
- ref += 16;
- src_line = _mm_loadu_si128((const __m128i *)ref);
- s1 = _mm_sad_epu8(src_line, zero);
+ for (int ht = 0; ht < height; ht++) {
+ const uint8_t *ref_tmp = ref + (ht * ref_stride);
+ __m128i zero = _mm_setzero_si128();
+ __m128i s0 = zero;
+ __m128i s1, src_line;
+ for (int i = 0; i < width; i += 16) {
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ ref_tmp += 16;
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
s0 = _mm_adds_epu16(s0, s1);
+ vbuf[ht] = _mm_extract_epi16(s0, 0) >> norm_factor;
}
-
- s1 = _mm_srli_si128(s0, 8);
- s0 = _mm_adds_epu16(s0, s1);
-
- return _mm_extract_epi16(s0, 0);
}
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 395e350..ff48fc3 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -1946,7 +1946,8 @@
uint8_t const *ref_buf, *src_buf;
int_mv *best_int_mv = &xd->mi[0]->mv[0];
unsigned int best_sad, tmp_sad, this_sad[4];
- const int norm_factor = 3 + (bw >> 5);
+ const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+ const int col_norm_factor = 3 + (bw >> 5);
const YV12_BUFFER_CONFIG *scaled_ref_frame =
av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
static const MV search_pos[4] = {
@@ -1981,28 +1982,16 @@
// Set up prediction 1-D reference set
ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
- for (idx = 0; idx < search_width; idx += 16) {
- aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
- ref_buf += 16;
- }
+ aom_int_pro_row(hbuf, ref_buf, ref_stride, search_width, bh, row_norm_factor);
ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
- for (idx = 0; idx < search_height; ++idx) {
- vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
- ref_buf += ref_stride;
- }
+ aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, search_height,
+ col_norm_factor);
// Set up src 1-D reference set
- for (idx = 0; idx < bw; idx += 16) {
- src_buf = x->plane[0].src.buf + idx;
- aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
- }
-
src_buf = x->plane[0].src.buf;
- for (idx = 0; idx < bh; ++idx) {
- src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
- src_buf += src_stride;
- }
+ aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+ aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
// Find the best match per 1-D search
best_int_mv->as_fullmv.col =
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 93f4c34..bcbf1b9 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,7 @@
protected:
// Handle blocks up to 4 blocks 64x64 with stride up to 128
static const int kDataAlignment = 16;
- static const int kDataBlockSize = 64 * 128;
+ static const int kDataBlockSize = 128 * 128;
virtual void SetUp() {
const testing::TestInfo *const test_info =
@@ -343,20 +343,32 @@
}
#endif // CONFIG_AV1_HIGHBITDEPTH
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
- const int ref_stride, const int height);
+typedef void (*IntProRowFunc)(int16_t *hbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
-// Params: height, asm function, c function.
-typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProRowFunc, IntProRowFunc> IntProRowParam;
class IntProRowTest : public AverageTestBase<uint8_t>,
public ::testing::WithParamInterface<IntProRowParam> {
public:
IntProRowTest()
- : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+ : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), hbuf_asm_(nullptr),
hbuf_c_(nullptr) {
- asm_func_ = GET_PARAM(1);
- c_func_ = GET_PARAM(2);
+ asm_func_ = GET_PARAM(2);
+ c_func_ = GET_PARAM(3);
+ }
+
+ void set_norm_factor() {
+ if (height_ == 128)
+ norm_factor_ = 6;
+ else if (height_ == 64)
+ norm_factor_ = 5;
+ else if (height_ == 32)
+ norm_factor_ = 4;
+ else if (height_ == 16)
+ norm_factor_ = 3;
}
protected:
@@ -366,10 +378,10 @@
ASSERT_NE(source_data_, nullptr);
hbuf_asm_ = static_cast<int16_t *>(
- aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+ aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * width_));
ASSERT_NE(hbuf_asm_, nullptr);
hbuf_c_ = static_cast<int16_t *>(
- aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+ aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * width_));
ASSERT_NE(hbuf_c_, nullptr);
}
@@ -383,19 +395,24 @@
}
void RunComparison() {
- API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
- API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
- EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ set_norm_factor();
+ API_REGISTER_STATE_CHECK(
+ c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+ API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, width_, width_,
+ height_, norm_factor_));
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
<< "Output mismatch\n";
}
void RunSpeedTest() {
const int numIter = 5000000;
- printf("Height = %d number of iteration is %d \n", height_, numIter);
+ set_norm_factor();
+ printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+ numIter);
aom_usec_timer c_timer_;
aom_usec_timer_start(&c_timer_);
for (int i = 0; i < numIter; i++) {
- c_func_(hbuf_c_, source_data_, 0, height_);
+ c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&c_timer_);
@@ -403,7 +420,7 @@
aom_usec_timer_start(&asm_timer_);
for (int i = 0; i < numIter; i++) {
- asm_func_(hbuf_asm_, source_data_, 0, height_);
+ asm_func_(hbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&asm_timer_);
@@ -415,7 +432,7 @@
asm_sum_time,
(static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
- EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+ EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
<< "Output mismatch\n";
}
@@ -424,35 +441,68 @@
IntProRowFunc c_func_;
int16_t *hbuf_asm_;
int16_t *hbuf_c_;
+ int norm_factor_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+typedef void (*IntProColFunc)(int16_t *vbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
-// Params: width, asm function, c function.
-typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProColFunc, IntProColFunc> IntProColParam;
class IntProColTest : public AverageTestBase<uint8_t>,
public ::testing::WithParamInterface<IntProColParam> {
public:
- IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
- asm_func_ = GET_PARAM(1);
- c_func_ = GET_PARAM(2);
+ IntProColTest()
+ : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), vbuf_asm_(nullptr),
+ vbuf_c_(nullptr) {
+ asm_func_ = GET_PARAM(2);
+ c_func_ = GET_PARAM(3);
}
protected:
+ virtual void SetUp() {
+ source_data_ = static_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+ ASSERT_NE(source_data_, nullptr);
+
+ vbuf_asm_ = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*vbuf_asm_) * width_));
+ ASSERT_NE(vbuf_asm_, nullptr);
+ vbuf_c_ = static_cast<int16_t *>(
+ aom_memalign(kDataAlignment, sizeof(*vbuf_c_) * width_));
+ ASSERT_NE(vbuf_c_, nullptr);
+ }
+
+ virtual void TearDown() {
+ aom_free(source_data_);
+ source_data_ = nullptr;
+ aom_free(vbuf_c_);
+ vbuf_c_ = nullptr;
+ aom_free(vbuf_asm_);
+ vbuf_asm_ = nullptr;
+ }
+
void RunComparison() {
- API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
- API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
- EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+ int norm_factor_ = 3 + (width_ >> 5);
+ API_REGISTER_STATE_CHECK(
+ c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+ API_REGISTER_STATE_CHECK(asm_func_(vbuf_asm_, source_data_, width_, width_,
+ height_, norm_factor_));
+ EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+ << "Output mismatch\n";
}
void RunSpeedTest() {
const int numIter = 5000000;
- printf("Width = %d number of iteration is %d \n", width_, numIter);
+ printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+ numIter);
+ int norm_factor_ = 3 + (width_ >> 5);
aom_usec_timer c_timer_;
aom_usec_timer_start(&c_timer_);
for (int i = 0; i < numIter; i++) {
- sum_c_ = c_func_(source_data_, width_);
+ c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&c_timer_);
@@ -460,7 +510,7 @@
aom_usec_timer_start(&asm_timer_);
for (int i = 0; i < numIter; i++) {
- sum_asm_ = asm_func_(source_data_, width_);
+ asm_func_(vbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
}
aom_usec_timer_mark(&asm_timer_);
@@ -472,14 +522,15 @@
asm_sum_time,
(static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
- EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+ EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+ << "Output mismatch\n";
}
private:
IntProColFunc asm_func_;
IntProColFunc c_func_;
- int16_t sum_asm_;
- int16_t sum_c_;
+ int16_t *vbuf_asm_;
+ int16_t *vbuf_c_;
};
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
@@ -703,19 +754,19 @@
INSTANTIATE_TEST_SUITE_P(
SSE2, IntProRowTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
- make_tuple(128, &aom_int_pro_row_sse2,
- &aom_int_pro_row_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_sse2, &aom_int_pro_row_c)));
INSTANTIATE_TEST_SUITE_P(
SSE2, IntProColTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
- make_tuple(128, &aom_int_pro_col_sse2,
- &aom_int_pro_col_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_sse2, &aom_int_pro_col_c)));
#endif
#if HAVE_AVX2
@@ -724,6 +775,22 @@
::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProRowTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_avx2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProColTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_avx2, &aom_int_pro_col_c)));
#endif
#if HAVE_NEON
@@ -737,19 +804,19 @@
make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
INSTANTIATE_TEST_SUITE_P(
NEON, IntProRowTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
- make_tuple(128, &aom_int_pro_row_neon,
- &aom_int_pro_row_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_neon, &aom_int_pro_row_c)));
INSTANTIATE_TEST_SUITE_P(
NEON, IntProColTest,
- ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
- make_tuple(128, &aom_int_pro_col_neon,
- &aom_int_pro_col_c)));
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_neon, &aom_int_pro_col_c)));
INSTANTIATE_TEST_SUITE_P(
NEON, AvgTest8bpp_avg_8x8_quad,