Add optimized convolve functions for single reference case
Add optimized convolve functions for the single-reference case, so that no
separate post-rounding pass is needed and the result is written directly to
the destination buffer. The duplicated code will be cleaned up later.
Change-Id: Iffc0cc6e135b8b6f45a95c314d63368f5aa35f34
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index c3c3613..2efc85f 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -560,26 +560,34 @@
}
# CONVOLVE_ROUND/COMPOUND_ROUND functions
-add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d sse2 avx2/;
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_2d_sr c/;
add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
specialize qw/av1_convolve_rounding avx2/;
-add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_copy sse2/;
-add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_2d_copy_sr c/;
+add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_x sse2/;
-add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_y sse2/;
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_x_sr c/;
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+specialize qw/av1_convolve_y_sr c/;
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_scale sse4_1/;
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
- add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+ add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_jnt_convolve_2d sse4_1/;
- add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+ add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_jnt_convolve_2d_copy sse2/;
}
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 526e2ea..42d8f34 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -373,7 +373,7 @@
bit widths for various intermediate values, see the comments above
av1_warp_affine_c.
*/
-void av1_convolve_2d_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
+void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
@@ -429,7 +429,7 @@
}
}
-void av1_convolve_y_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
+void av1_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
@@ -462,7 +462,7 @@
}
}
-void av1_convolve_x_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
+void av1_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
@@ -495,8 +495,8 @@
}
}
-void av1_convolve_2d_copy_c(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
+void av1_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
@@ -524,9 +524,132 @@
}
}
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                          int dst_stride, int w, int h,
+                          InterpFilterParams *filter_params_x,
+                          InterpFilterParams *filter_params_y,
+                          const int subpel_x_q4, const int subpel_y_q4,
+                          ConvolveParams *conv_params) {  // Separable 2D filter: horizontal pass into im_block, then vertical pass whose rounded result goes through clip_pixel() into dst.
+  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];  // intermediate rows produced by the horizontal pass
+  int im_h = h + filter_params_y->taps - 1;  // output rows plus the extra rows the vertical taps reach
+  int im_stride = w;  // im_block rows are packed at the block width
+  const int fo_vert = filter_params_y->taps / 2 - 1;  // vertical taps that lie before the current row
+  const int fo_horiz = filter_params_x->taps / 2 - 1;  // horizontal taps that lie before the current column
+  const int bd = 8;  // bit depth used to size the offsets below
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;  // shift still owed after round_0 and round_1
+
+  // horizontal filter: starts fo_vert rows above src and fills im_h rows of im_block
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));  // offset so the sum stays non-negative (asserted below)
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+      im_block[y * im_stride + x] =
+          ROUND_POWER_OF_TWO(sum, conv_params->round_0);  // first rounding stage
+    }
+  }
+
+  // vertical filter: reads im_block, writes final pixels to dst
+  int32_t *src_vert = im_block + fo_vert * im_stride;  // im_block row that lines up with output row 0
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;  // log2 of the vertical-pass offset
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 1 << offset_bits;  // offset so the sum stays non-negative (asserted below)
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));  // second rounding stage, then remove both offsets; NOTE(review): presumably relies on the y taps summing to 1 << FILTER_BITS - confirm
+      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));  // apply the remaining shift and store directly in dst
+    }
+  }
+}
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                         int dst_stride, int w, int h,
+                         InterpFilterParams *filter_params_x,
+                         InterpFilterParams *filter_params_y,
+                         const int subpel_x_q4, const int subpel_y_q4,
+                         ConvolveParams *conv_params) {  // Vertical-only filter: each output is rounded by FILTER_BITS and passed through clip_pixel() into dst.
+  const int fo_vert = filter_params_y->taps / 2 - 1;  // vertical taps that lie before the current row
+  (void)filter_params_x;  // horizontal parameters are not used by this variant
+  (void)subpel_x_q4;
+  (void)conv_params;  // rounding here uses FILTER_BITS only
+
+  // vertical filter: taps cover src rows y - fo_vert .. y - fo_vert + taps - 1
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      CONV_BUF_TYPE res = 0;  // accumulator for the weighted column samples
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+      }
+      dst[y * dst_stride + x] =
+          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));  // single rounding stage, stored directly in dst
+    }
+  }
+}
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                         int dst_stride, int w, int h,
+                         InterpFilterParams *filter_params_x,
+                         InterpFilterParams *filter_params_y,
+                         const int subpel_x_q4, const int subpel_y_q4,
+                         ConvolveParams *conv_params) {  // Horizontal-only filter: rounds by round_0, then by the remaining bits, and stores clip_pixel() results in dst.
+  const int fo_horiz = filter_params_x->taps / 2 - 1;  // horizontal taps that lie before the current column
+  const int bits = FILTER_BITS - conv_params->round_0;  // shift left over after round_0 so the total is FILTER_BITS
+  (void)filter_params_y;  // vertical parameters are not used by this variant
+  (void)subpel_y_q4;
+  (void)conv_params;  // NOTE(review): conv_params->round_0 is read in this function, so this cast looks redundant
+
+  // horizontal filter: taps cover src columns x - fo_horiz .. x - fo_horiz + taps - 1
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      CONV_BUF_TYPE res = 0;  // accumulator for the weighted row samples
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+      }
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);  // first rounding stage
+      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));  // second rounding stage, stored directly in dst
+    }
+  }
+}
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               InterpFilterParams *filter_params_x,
+                               InterpFilterParams *filter_params_y,
+                               const int subpel_x_q4, const int subpel_y_q4,
+                               ConvolveParams *conv_params) {  // No filtering: copies a w x h block of bytes from src to dst.
+  (void)filter_params_x;  // filter, subpel and rounding arguments are all unused by the copy
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      dst[y * dst_stride + x] = src[y * src_stride + x];  // each buffer is addressed with its own stride
+    }
+  }
+}
+
#if CONFIG_JNT_COMP
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
@@ -593,8 +716,8 @@
}
void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
@@ -743,7 +866,7 @@
&filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, conv_params);
else
- sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][1](
+ sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
src, src_stride, dst, dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
}
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index f2e74e5..6378bb1 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -44,8 +44,8 @@
} ConvolveParams;
typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 5e17d96..c8b0181 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -77,7 +77,11 @@
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys, 0,
conv_params, sf);
- conv_params->do_post_rounding = 1;
+
+ if (conv_params->is_compound)
+ conv_params->do_post_rounding = 1;
+ else
+ conv_params->do_post_rounding = 0;
} else {
assert(conv_params->round == CONVOLVE_OPT_ROUND);
diff --git a/av1/common/scale.c b/av1/common/scale.c
index 3f333f6..54ad762 100644
--- a/av1/common/scale.c
+++ b/av1/common/scale.c
@@ -182,9 +182,18 @@
}
#endif // CONFIG_HIGHBITDEPTH
-#if CONFIG_JNT_COMP
+ // AV1 convolve functions
// Special case convolve functions should produce the same result as
- // av1_int_convolve_2d.
+ // av1_convolve_2d.
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
+ // subpel_x_q4 == 0
+ sf->convolve[0][1][0] = av1_convolve_y_sr;
+ // subpel_y_q4 == 0
+ sf->convolve[1][0][0] = av1_convolve_x_sr;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->convolve[1][1][0] = av1_convolve_2d_sr;
+#if CONFIG_JNT_COMP
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
// subpel_x_q4 == 0
@@ -196,8 +205,6 @@
// subpel_x_q4 != 0 && subpel_y_q4 != 0
sf->convolve[1][1][1] = av1_jnt_convolve_2d;
#else
- // Special case convolve functions should produce the same result as
- // av1_convolve_2d.
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][1] = av1_convolve_2d_copy;
// subpel_x_q4 == 0
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index 4db1380..728e651 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -17,8 +17,8 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-void av1_convolve_2d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
+void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index a0b911c..a73e710 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -17,8 +17,8 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w, int h,
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
@@ -205,8 +205,8 @@
}
void av1_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
@@ -322,8 +322,8 @@
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c
index 4fc946d..5479c38 100644
--- a/av1/common/x86/convolve_2d_sse4.c
+++ b/av1/common/x86/convolve_2d_sse4.c
@@ -20,8 +20,8 @@
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
- const uint8_t *dst0, int dst_stride0, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
diff --git a/test/av1_convolve_2d_test_util.h b/test/av1_convolve_2d_test_util.h
index e31016c..4847ee7 100644
--- a/test/av1_convolve_2d_test_util.h
+++ b/test/av1_convolve_2d_test_util.h
@@ -25,8 +25,8 @@
namespace AV1Convolve2D {
typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);