Optimize highbd sub_pixel_variance module
Enabled SSE2 variant for sub_pixel_variance module of
blk_sizes 128x128, 128x64 and 64x128.
When tested for multiple test cases observed 1.25%
average reduction in encoder time for preset = 4.
Module level gains improved by a factor of ~12x
on average w.r.t to C code.
Change-Id: I49c80d586e2d52938086d9fb1ff99dcc371b0e04
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index c83a99f..0ee0b6e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1370,6 +1370,15 @@
#
# Subpixel Variance
#
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
+
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
@@ -1406,6 +1415,15 @@
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
+
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
@@ -1442,6 +1460,15 @@
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
+
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 2373d3a..fc5678d 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -287,30 +287,38 @@
uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- unsigned int sse2; \
+ int se = 0; \
+ unsigned int sse = 0; \
+ unsigned int sse2; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
- if (w > wf * 2) { \
+ if (w > wf) { \
se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
&sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
} \
} \
*sse_ptr = sse; \
@@ -322,33 +330,42 @@
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int64_t var; \
uint32_t sse; \
+ uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- uint32_t sse2; \
+ int se = 0; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
+ long_sse += sse; \
+ if (w > wf) { \
+ uint32_t sse2; \
se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
&sse2, NULL, NULL); \
se += se2; \
- sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \
*sse_ptr = sse; \
var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
return (var >= 0) ? (uint32_t)var : 0; \
@@ -364,35 +381,38 @@
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int row_rep = (w > 64) ? 2 : 1; \
for (start_row = 0; start_row < h; start_row += 16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, y_offset, \
- dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
- NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
- &sse2, NULL, NULL); \
+ uint16_t *src_tmp = src + (start_row * src_stride); \
+ uint16_t *dst_tmp = dst + (start_row * dst_stride); \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src_tmp += wd_64 * 64; \
+ dst_tmp += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \
+ height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
- if (w > wf * 2) { \
+ if (w > wf) { \
se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
- height, &sse2, NULL, NULL); \
+ src_tmp + 16, src_stride, x_offset, y_offset, dst_tmp + 16, \
+ dst_stride, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
- height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 32, src_stride, x_offset, y_offset, dst_tmp + 32, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 48, src_stride, x_offset, y_offset, dst_tmp + 48, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
} \
} \
} \
@@ -403,22 +423,25 @@
return (var >= 0) ? (uint32_t)var : 0; \
}
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt, (int64_t)); \
- FN(16, 4, 16, 4, 2, opt, (int64_t)); \
- FN(8, 32, 8, 3, 5, opt, (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t)); \
+ FN(128, 64, 16, 7, 6, opt, (int64_t)); \
+ FN(64, 128, 16, 6, 7, opt, (int64_t)); \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)); \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)); \
FN(64, 16, 16, 6, 4, opt, (int64_t))
FNS(sse2);
diff --git a/test/variance_test.cc b/test/variance_test.cc
index d57fef9..1942de0 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -703,6 +703,7 @@
protected:
void RefTest();
void ExtremeRefTest();
+ void SpeedTest();
ACMRandom rnd_;
uint8_t *src_;
@@ -785,6 +786,41 @@
}
}
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
+ src_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+
+ unsigned int sse1;
+ int run_time = 1000000000 / block_size();
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_time; ++i) {
+ int x = rnd_(8);
+ int y = rnd_(8);
+ params_.func(ref_, width() + 1, x, y, src_, width(), &sse1);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+ params_.bit_depth, elapsed_time);
+}
+
template <>
void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
@@ -1188,6 +1224,7 @@
TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
/* TODO(debargha): This test does not support the highbd version
@@ -1677,6 +1714,9 @@
#endif // HAVE_AVX2
const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
+ SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12),
+ SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12),
+ SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12),
SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12),
SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12),
SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12),
@@ -1688,6 +1728,9 @@
SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12),
SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12),
SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12),
+ SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10),
+ SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10),
+ SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10),
SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10),
SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10),
SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10),
@@ -1699,6 +1742,9 @@
SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10),
SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10),
SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10),
+ SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8),
+ SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8),
+ SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8),
SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8),
SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8),
SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8),
@@ -1711,7 +1757,6 @@
SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8)
};
-
INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelVarianceTest,
::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));