Stop using VP9 convolve scheme in AV1 encoder.
Discontinue all VP9-style convolve rounding operations in the
non-normative parts of the encoder.

The C version av1_convolve_2d_sr_c is forced instead of the SIMD
versions of the same function, because the SIMD versions are not
compatible with round_1 > 0.

With -DCONFIG_LOWPRECISION_BLEND=2 -DCONFIG_HIGHPRECISION_INTBUF=1,
results on 15 frames of lowres (cpu-used=1) are 0.019% better
(-0.019%).
Change-Id: I72154bd896357c352c944fb2cd3b25bafafba46a
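
Reviewer note (not part of the patch): below is a minimal, self-contained
C sketch of the rounding budget that get_conv_params_no_round() now sets
up for the non-compound path, where round_1 = 2 * FILTER_BITS - round_0,
so the final av1_highbd_convolve_rounding(..., 0, bd) call in the patch
shifts by 0 bits. FILTER_BITS and ROUND0_BITS are hard-coded here as
assumed stand-ins (7 and 5); the real values come from the library
headers and may be adjusted by the intbufrange check.

    #include <assert.h>
    #include <stdint.h>

    #define SKETCH_FILTER_BITS 7 /* assumed stand-in for FILTER_BITS */
    #define SKETCH_ROUND0_BITS 5 /* assumed stand-in for ROUND0_BITS */

    /* Same behavior as ROUND_POWER_OF_TWO for non-negative inputs. */
    static int32_t sketch_round_pow2(int64_t v, int n) {
      return (int32_t)((v + (((int64_t)1 << n) >> 1)) >> n);
    }

    int main(void) {
      const int round_0 = SKETCH_ROUND0_BITS;
      /* Non-compound path after this change: round_1 absorbs all of the
         remaining 2 * FILTER_BITS of filter gain, leaving nothing for a
         separate post-rounding step. */
      const int round_1 = 2 * SKETCH_FILTER_BITS - round_0;
      assert(2 * SKETCH_FILTER_BITS - round_0 - round_1 == 0);

      /* Toy 1-tap "filter": each pass adds FILTER_BITS of gain, and each
         rounding stage removes round_0 / round_1 of it. */
      const int32_t pixel = 128;
      const int64_t horiz = (int64_t)pixel << SKETCH_FILTER_BITS;
      const int32_t intermediate = sketch_round_pow2(horiz, round_0);
      const int64_t vert = (int64_t)intermediate << SKETCH_FILTER_BITS;
      const int32_t out = sketch_round_pow2(vert, round_1);
      assert(out == pixel); /* no residual shift left to apply */
      return 0;
    }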
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 2114451..a56eb94 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -453,19 +453,16 @@
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
- if (aom_config("CONFIG_JNT_COMP") eq "yes") {
- specialize qw/av1_warp_affine sse4_1/;
- }
+ specialize qw/av1_warp_affine sse4_1/;
} else {
specialize qw/av1_warp_affine sse2 ssse3/;
}
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
- if (aom_config("CONFIG_JNT_COMP") eq "yes") {
- specialize qw/av1_highbd_warp_affine sse4_1/;
- }
+ specialize qw/av1_highbd_warp_affine sse4_1/;
} else {
specialize qw/av1_highbd_warp_affine ssse3/;
}
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 95458c1..8bc6f6d 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -1159,9 +1159,6 @@
const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
int bd) {
- (void)dst;
- (void)dst_stride;
-
InterpFilterParams filter_params_x, filter_params_y;
#if CONFIG_SHORT_FILTER
av1_get_convolve_filter_params(interp_filters, &filter_params_x,
@@ -1172,71 +1169,101 @@
#endif
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- if (filter_params_y.taps < filter_params_x.taps) {
- uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
- (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
- int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
- CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
- int tr_dst_stride = MAX_SB_SIZE;
- int fo_vert = filter_params_y.taps / 2 - 1;
- int fo_horiz = filter_params_x.taps / 2 - 1;
+ if (conv_params->dst) {
+ if (filter_params_y.taps < filter_params_x.taps) {
+ uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
+ (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
+ int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
+ CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+ int tr_dst_stride = MAX_SB_SIZE;
+ int fo_vert = filter_params_y.taps / 2 - 1;
+ int fo_horiz = filter_params_x.taps / 2 - 1;
- transpose_uint16(
- tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz,
- src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
- transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
- conv_params->dst_stride, w, h);
+ transpose_uint16(tr_src, tr_src_stride,
+ src - fo_vert * src_stride - fo_horiz, src_stride,
+ w + filter_params_x.taps - 1,
+ h + filter_params_y.taps - 1);
+ transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
+ conv_params->dst_stride, w, h);
-// horizontal and vertical parameters are swapped because of the transpose
+ // horizontal and vertical parameters are swapped because of the transpose
#if CONFIG_JNT_COMP
- if (scaled)
- av1_highbd_convolve_2d_scale(
- tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
- tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
- y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
- else
- av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
- tr_src_stride, tr_dst, tr_dst_stride, h, w,
- &filter_params_y, &filter_params_x,
- subpel_y_q4, subpel_x_q4, conv_params, bd);
+ if (scaled)
+ av1_highbd_convolve_2d_scale(
+ tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
+ tr_dst_stride, h, w, &filter_params_y, &filter_params_x,
+ subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
+ else
+ av1_highbd_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
+ tr_src_stride, tr_dst, tr_dst_stride, h, w,
+ &filter_params_y, &filter_params_x,
+ subpel_y_q4, subpel_x_q4, conv_params, bd);
#else
- if (scaled)
- av1_highbd_convolve_2d_scale(
- tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
- tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4,
- y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
- else
- av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
- tr_src_stride, tr_dst, tr_dst_stride, h, w,
- &filter_params_y, &filter_params_x, subpel_y_q4,
- subpel_x_q4, conv_params, bd);
+ if (scaled)
+ av1_highbd_convolve_2d_scale(
+ tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst,
+ tr_dst_stride, h, w, &filter_params_y, &filter_params_x,
+ subpel_y_q4, y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd);
+ else
+ av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
+ tr_src_stride, tr_dst, tr_dst_stride, h, w,
+ &filter_params_y, &filter_params_x, subpel_y_q4,
+ subpel_x_q4, conv_params, bd);
#endif // CONFIG_JNT_COMP
- transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
- tr_dst_stride, h, w);
+ transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
+ tr_dst_stride, h, w);
+ } else {
+#if CONFIG_JNT_COMP
+ if (scaled)
+ av1_highbd_convolve_2d_scale(
+ src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
+ &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
+ subpel_y_q4, y_step_q4, conv_params, bd);
+ else
+ av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst,
+ conv_params->dst_stride, w, h,
+ &filter_params_x, &filter_params_y,
+ subpel_x_q4, subpel_y_q4, conv_params, bd);
+#else
+ if (scaled)
+ av1_highbd_convolve_2d_scale(
+ src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
+ &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
+ subpel_y_q4, y_step_q4, conv_params, bd);
+ else
+ av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
+ conv_params->dst_stride, w, h, &filter_params_x,
+ &filter_params_y, subpel_x_q4, subpel_y_q4,
+ conv_params, bd);
+#endif // CONFIG_JNT_COMP
+ }
} else {
+ CONV_BUF_TYPE tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+ int tmp_dst_stride = MAX_SB_SIZE;
#if CONFIG_JNT_COMP
if (scaled)
- av1_highbd_convolve_2d_scale(
- src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
- &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
- subpel_y_q4, y_step_q4, conv_params, bd);
+ av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
+ h, &filter_params_x, &filter_params_y,
+ subpel_x_q4, x_step_q4, subpel_y_q4,
+ y_step_q4, conv_params, bd);
else
- av1_highbd_jnt_convolve_2d(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h,
+ av1_highbd_jnt_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
&filter_params_x, &filter_params_y,
subpel_x_q4, subpel_y_q4, conv_params, bd);
#else
if (scaled)
- av1_highbd_convolve_2d_scale(
- src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
- &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
- subpel_y_q4, y_step_q4, conv_params, bd);
+ av1_highbd_convolve_2d_scale(src, src_stride, tmp_dst, tmp_dst_stride, w,
+ h, &filter_params_x, &filter_params_y,
+ subpel_x_q4, x_step_q4, subpel_y_q4,
+ y_step_q4, conv_params, bd);
else
- av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4,
- conv_params, bd);
+ av1_highbd_convolve_2d(src, src_stride, tmp_dst, tmp_dst_stride, w, h,
+ &filter_params_x, &filter_params_y, subpel_x_q4,
+ subpel_y_q4, conv_params, bd);
#endif // CONFIG_JNT_COMP
+ // 0-bit rounding just to convert from int32 to uint16
+ av1_highbd_convolve_rounding(tmp_dst, tmp_dst_stride, dst, dst_stride, w, h,
+ 0, bd);
}
}
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index b95a58f..7bbf220 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -60,27 +60,6 @@
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
-static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
- int bd) {
- ConvolveParams conv_params;
- conv_params.ref = ref;
- conv_params.do_average = do_average;
- conv_params.round = CONVOLVE_OPT_ROUND;
- conv_params.plane = plane;
- conv_params.do_post_rounding = 0;
- conv_params.round_0 = ROUND0_BITS;
- conv_params.round_1 = 0;
- conv_params.is_compound = 0;
- conv_params.dst = NULL;
- conv_params.dst_stride = 0;
- const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
- if (bd < 12) assert(intbufrange <= 16);
- if (intbufrange > 16) {
- conv_params.round_0 += intbufrange - 16;
- }
- return conv_params;
-}
-
static INLINE void av1_get_convolve_filter_params(InterpFilters interp_filters,
InterpFilterParams *params_x,
InterpFilterParams *params_y
@@ -107,6 +86,7 @@
struct AV1Common;
struct scale_factors;
+
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilters interp_filters, const int subpel_x_q4,
@@ -114,6 +94,27 @@
int scaled, ConvolveParams *conv_params,
const struct scale_factors *sf);
+static INLINE ConvolveParams get_conv_params_round(int ref, int do_average,
+ int plane, int bd) {
+ ConvolveParams conv_params;
+ conv_params.ref = ref;
+ conv_params.do_average = do_average;
+ conv_params.plane = plane;
+ conv_params.round = CONVOLVE_OPT_ROUND;
+ conv_params.round_0 = ROUND0_BITS;
+ conv_params.round_1 = 0;
+ conv_params.do_post_rounding = 0;
+ conv_params.is_compound = 0;
+ conv_params.dst = NULL;
+ conv_params.dst_stride = 0;
+ const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+ if (bd < 12) assert(intbufrange <= 16);
+ if (intbufrange > 16) {
+ conv_params.round_0 += intbufrange - 16;
+ }
+ return conv_params;
+}
+
static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
int plane, int32_t *dst,
int dst_stride,
@@ -125,7 +126,8 @@
conv_params.is_compound = is_compound;
conv_params.round_0 = ROUND0_BITS;
#if CONFIG_LOWPRECISION_BLEND
- conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS : 0;
+ conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
+ : 2 * FILTER_BITS - conv_params.round_0;
#else
conv_params.round_1 = 0;
#endif
@@ -145,6 +147,11 @@
return conv_params;
}
+static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+ int bd) {
+ return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+}
+
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
InterpFilters interp_filters,
diff --git a/av1/common/mv.h b/av1/common/mv.h
index 4c547fe..2f9011a 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -58,7 +58,7 @@
#define WARP_PARAM_REDUCE_BITS 6
// Precision bits reduction after horizontal shear
-#define HORSHEAR_REDUCE_PREC_BITS 5
+#define HORSHEAR_REDUCE_PREC_BITS 3
#define VERSHEAR_REDUCE_PREC_BITS \
(2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
diff --git a/av1/common/scale.c b/av1/common/scale.c
index a334bae..b43de0e 100644
--- a/av1/common/scale.c
+++ b/av1/common/scale.c
@@ -185,7 +185,7 @@
// subpel_y_q4 == 0
sf->convolve[1][0][0] = av1_convolve_x_sr;
// subpel_x_q4 != 0 && subpel_y_q4 != 0
- sf->convolve[1][1][0] = av1_convolve_2d_sr;
+ sf->convolve[1][1][0] = av1_convolve_2d_sr_c;
#if CONFIG_JNT_COMP
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 5daa2cf..71cd85a 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -422,19 +422,24 @@
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
int32_t tmp[15 * 8];
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
- const int reduce_bits_horiz =
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
+ int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+ if (!use_conv_params &&
+ bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
+ reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
+ const int reduce_bits_vert =
+ use_conv_params ? conv_params->round_1
+ : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int max_bits_horiz =
- use_conv_params
- ? bd + FILTER_BITS + 1 - conv_params->round_0
- : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
+ use_conv_params ? bd + FILTER_BITS + 1 - conv_params->round_0
+ : bd + WARPEDPIXEL_FILTER_BITS + 1 - reduce_bits_horiz;
const int offset_bits_horiz =
use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
const int offset_bits_vert =
- use_conv_params
- ? bd + 2 * FILTER_BITS - conv_params->round_0
- : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
+ use_conv_params ? bd + 2 * FILTER_BITS - conv_params->round_0
+ : bd + 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
if (use_conv_params) {
conv_params->do_post_rounding = 1;
}
@@ -534,7 +539,7 @@
} else {
uint16_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
- sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
assert(0 <= sum && sum < (1 << (bd + 2)));
uint16_t px =
clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
@@ -719,9 +724,13 @@
int16_t gamma, int16_t delta) {
int32_t tmp[15 * 8];
const int bd = 8;
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+ const int reduce_bits_vert =
+ use_conv_params ? conv_params->round_1
+ : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int max_bits_horiz =
use_conv_params
? bd + FILTER_BITS + 1 - conv_params->round_0
@@ -837,7 +846,7 @@
} else {
uint8_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
- sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
assert(0 <= sum && sum < (1 << (bd + 2)));
uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
if (conv_params->do_average)
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 3692b60..2843a91 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -140,7 +140,7 @@
void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
int bits) {
- const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+ const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
if (w > 64) { // width = 128
@@ -283,7 +283,7 @@
uint8_t *dst8, int dst_stride, int w,
int h, int bits, int bd) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+ const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
if (w > 64) { // width = 128
diff --git a/av1/common/x86/highbd_warp_plane_sse4.c b/av1/common/x86/highbd_warp_plane_sse4.c
index 0cd438a..e89ad8b 100644
--- a/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/av1/common/x86/highbd_warp_plane_sse4.c
@@ -22,15 +22,18 @@
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
int comp_avg = conv_params->do_average;
-#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
-#else
-#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
-#endif
int i, j, k;
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
- const int reduce_bits_horiz =
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
+ int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+ if (!use_conv_params &&
+ bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
+ reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
+ const int reduce_bits_vert =
+ use_conv_params ? conv_params->round_1
+ : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int offset_bits_horiz =
use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
if (use_conv_params) {
@@ -91,10 +94,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -104,10 +106,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -361,13 +362,13 @@
} else {
// Round and pack into 8 bits
const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
- ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
// Clamp res_16bit to the range [0, 2^bd - 1]
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c
index dc727b6..e1d7f8e 100644
--- a/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -22,21 +22,25 @@
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
int comp_avg = conv_params->do_average;
-#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
-#else
-#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
-#endif
int i, j, k;
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
- const int reduce_bits_horiz =
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
+ int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
+ if (!use_conv_params &&
+ bd + WARPEDPIXEL_FILTER_BITS + 2 - reduce_bits_horiz > 16)
+ reduce_bits_horiz += bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 14;
+ const int reduce_bits_vert =
+ use_conv_params ? conv_params->round_1
+ : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
const int offset_bits_horiz =
use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
if (use_conv_params) {
conv_params->do_post_rounding = 1;
}
assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
+ if (bd == 12 && reduce_bits_horiz < 5) printf("Error\n");
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
@@ -85,10 +89,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -98,10 +101,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -320,13 +322,13 @@
} else {
// Round and pack into 8 bits
const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
- ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
// Clamp res_16bit to the range [0, 2^bd - 1]
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 75ed82b..d330cd3 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -24,7 +24,8 @@
__m128i tmp[15];
int i, j, k;
const int bd = 8;
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int offset_bits_horiz =
@@ -81,10 +82,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -94,10 +94,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
index 2c97704..b421533 100644
--- a/av1/common/x86/warp_plane_sse4.c
+++ b/av1/common/x86/warp_plane_sse4.c
@@ -212,7 +212,8 @@
__m128i tmp[15];
int i, j, k;
const int bd = 8;
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int offset_bits_horiz =
@@ -275,10 +276,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -288,10 +288,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c
index b0501e9..f18dad1 100644
--- a/av1/common/x86/warp_plane_ssse3.c
+++ b/av1/common/x86/warp_plane_ssse3.c
@@ -211,7 +211,8 @@
__m128i tmp[15];
int i, j, k;
const int bd = 8;
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
+ const int use_conv_params =
+ (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
const int reduce_bits_horiz =
use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
const int offset_bits_horiz =
@@ -268,10 +269,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -281,10 +281,9 @@
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
+ (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc
index 288daeb..0900aa1 100644
--- a/test/av1_convolve_optimz_test.cc
+++ b/test/av1_convolve_optimz_test.cc
@@ -66,7 +66,7 @@
subpel_ = GET_PARAM(4);
int ref = GET_PARAM(5);
const int plane = 0;
- conv_params_ = get_conv_params(ref, ref, plane, 8);
+ conv_params_ = get_conv_params_round(ref, ref, plane, 8);
alloc_ = new uint8_t[maxBlockSize * 4];
src_ = alloc_ + (vertiOffset * maxWidth);
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index cb58289..397fc70 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -149,7 +149,7 @@
TEST_P(Av1ConvolveTest, av1_convolve_vert) {
const int y_step_q4 = 16;
- ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+ ConvolveParams conv_params = get_conv_params_round(0, 0, 0, 8);
int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);
@@ -202,7 +202,7 @@
TEST_P(Av1ConvolveTest, av1_convolve_horiz) {
const int x_step_q4 = 16;
- ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+ ConvolveParams conv_params = get_conv_params_round(0, 0, 0, 8);
int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);