Facilitate SIMD optimization for highbd subpel motion search
To facilitate SIMD optimization of the 4-tap filter in
highbd subpel motion search, the unit tests are corrected to do
a C vs. SIMD comparison.
Change-Id: Ifd0bea87af6ab80f3dc5a2f95354185dc49ae752
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d298152..51e0621 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -931,7 +931,6 @@
int subpel_search";
specialize qw/aom_comp_mask_upsampled_pred sse2/;
-
add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 961d213..0f4990e 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -969,13 +969,13 @@
} else if (!subpel_y_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
- NULL, -1, width, height, bd);
+ aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
} else if (!subpel_x_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
- kernel, 16, width, height, bd);
+ aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
} else {
DECLARE_ALIGNED(16, uint16_t,
temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
@@ -986,11 +986,11 @@
const int intermediate_height =
(((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, CONVERT_TO_BYTEPTR(temp),
- MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
- intermediate_height, bd);
- aom_highbd_convolve8_vert(
+ aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, CONVERT_TO_BYTEPTR(temp),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_c(
CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
bd);
@@ -1052,9 +1052,9 @@
const int bck_offset = jcp_param->bck_offset;
const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
- height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd, subpel_search);
+ aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index 3e19682..bad9c62 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -122,7 +122,30 @@
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] | filter[1] | filter[2]) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
src_start, src_stride, dst, dst_stride, h, filter, bd); \
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 48b8d15..098cf96 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -678,7 +678,7 @@
}
const InterpFilterParams *filter = av1_get_filter(subpel_search);
-
+ int filter_taps = SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
@@ -726,17 +726,18 @@
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
+ uint16_t *temp_start_horiz = temp;
+ uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, CONVERT_TO_BYTEPTR(temp),
- MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
- intermediate_height, bd);
- aom_highbd_convolve8_vert(
- CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
- MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
- bd);
+ aom_highbd_convolve8_horiz(
+ ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
+ aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
+ comp_pred8, width, NULL, -1, kernel_y, 16, width,
+ height, bd);
}
}
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index 9661dd9..11fa743 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -451,7 +451,8 @@
jnt_comp_params.use_jnt_comp_avg = 1;
int sub_x_q3, sub_y_q3;
int subpel_search;
- for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+ ++subpel_search) {
for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
for (int ii = 0; ii < 2; ii++) {
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index 33e3d55..53ba56c 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -459,6 +459,7 @@
void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput(
highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+ (void)test_impl;
int bd_ = GET_PARAM(2);
const int w = block_size_wide[bsize];
const int h = block_size_high[bsize];
@@ -481,19 +482,24 @@
const uint8_t *mask =
av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
- aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c; // ref
- aom_highbd_comp_mask_upsampled_pred(
- NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_),
- CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby,
- CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_,
- subpel_search);
+ // ref
+ aom_highbd_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_), w, h, subx,
+ suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search);
- aom_highbd_comp_mask_pred = test_impl; // test
- aom_highbd_comp_mask_upsampled_pred(
- NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred2_),
- CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby,
- CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_,
- subpel_search);
+ aom_highbd_comp_mask_pred_c(
+ CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(comp_pred1_), w, mask, w, inv);
+
+ // test
+ aom_highbd_upsampled_pred(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred2_), w, h, subx,
+ suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search);
+
+ aom_highbd_comp_mask_pred(
+ CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(comp_pred2_), w, mask, w, inv);
+
ASSERT_EQ(CheckResult(w, h), true)
<< " wedge " << wedge_index << " inv " << inv << "sub (" << subx
<< "," << suby << ")";