Facilitate SIMD optimization of subpel motion search
To facilitate simd optimization of 4-tap filter in
subpel motion search, unit tests are corrected to do
a C vs SIMD comparison.
Change-Id: I502dae35bb17d5ce9cf47e49bc6b46ba73deb37c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 040ac16..27e95b9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -909,6 +909,13 @@
int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
+ add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search";
+ specialize qw/aom_comp_mask_upsampled_pred sse2/;
+
add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index db179d6..23b7153 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -382,13 +382,13 @@
} else if (!subpel_y_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
- width, height);
+ aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+ -1, width, height);
} else if (!subpel_x_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
- width, height);
+ aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+ 16, width, height);
} else {
DECLARE_ALIGNED(16, uint8_t,
temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
@@ -399,12 +399,12 @@
const int intermediate_height =
(((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
+ aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+ MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+ width, height);
}
}
@@ -455,8 +455,9 @@
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
@@ -1092,23 +1093,23 @@
}
}
-void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask,
- int subpel_search) {
+void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search) {
if (subpel_x_q3 | subpel_y_q3) {
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride,
- subpel_search);
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
ref = comp_pred;
ref_stride = width;
}
- aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
- mask_stride, invert_mask);
+ aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
}
#define MASK_SUBPIX_VAR(W, H) \
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index d29c6e4..77ad261 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -70,13 +70,6 @@
const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-void aom_comp_mask_upsampled_pred(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int subpel_search);
-
void aom_highbd_comp_mask_upsampled_pred(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index b7c6980..001ed47 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -33,7 +33,30 @@
(void)y_step_q4; \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
- if (filter[0] | filter[1] | filter[2]) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 1a27fd2..051dff6 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -575,6 +575,7 @@
(subpel_search == 1)
? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
: av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ int filter_taps = SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
if (width >= 16) {
@@ -636,15 +637,16 @@
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz = temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
}
}
@@ -669,6 +671,23 @@
}
}
+void aom_comp_mask_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;
+ ref_stride = width;
+ }
+ aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+
static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
const __m128i s1,
const __m128i a) {
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index e663469..34be2aa 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -200,10 +200,10 @@
const uint8_t *mask =
av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
- aom_comp_mask_pred = aom_comp_mask_pred_c; // ref
- aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
- w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
- w, inv, subpel_search);
+ // ref
+ aom_comp_mask_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_,
+ MAX_SB_SIZE, mask, w, inv, subpel_search);
aom_comp_mask_pred = test_impl; // test
aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,