Prepare for vectorizing highbd warp filter
This applies the same refactorings to highbd_warp_plane
which were applied to warp_plane a while ago, and lays the
groundwork for the relevant tests.
Change-Id: Ic4c00bce1accc5a3624bba0c3b4b325e69a42c1a
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b600c66..b900d4a 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -670,6 +670,10 @@
(aom_config("CONFIG_GLOBAL_MOTION") eq "yes")) {
add_proto qw/void av1_warp_affine/, "int32_t *mat, uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int ref_frm, int32_t alpha, int32_t beta, int32_t gamma, int32_t delta";
specialize qw/av1_warp_affine sse2/;
+ # High-bitdepth prototype: uint16_t buffers plus an explicit 'bd'. No 'specialize' line follows, unlike av1_warp_affine above.
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_warp_affine/, "int32_t *mat, uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, int ref_frm, int32_t alpha, int32_t beta, int32_t gamma, int32_t delta";
+ }
}
# LOOP_RESTORATION functions
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 1aecd56..94cf5f9 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -618,6 +618,14 @@
8240, 8224, 8208, 8192,
};
+static inline int16_t saturate_int16(int32_t v) {  // Clamp v to the int16_t range.
+ if (v > 32767)
+ return 32767;
+ else if (v < -32768)
+ return -32768;
+ return v;  // Already within [-32768, 32767], so the narrowing is lossless.
+}
+
#if CONFIG_WARPED_MOTION
// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
@@ -848,6 +856,138 @@
// Note: For an explanation of the warp algorithm, see the comment
// above warp_plane()
+//
+// Note also: The "worst case" in terms of modulus of the data stored into 'tmp'
+// (ie, the result of 'sum' in the horizontal filter) occurs when:
+// coeffs = { -2, 8, -22, 87, 72, -21, 8, -2}, and
+// ref = { 0, 4095, 0, 4095, 4095, 0, 4095, 0} (12-bit samples)
+// Before rounding, this gives sum = 716625. After rounding,
+// HORSHEAR_REDUCE_PREC_BITS = 4 => sum = 44789 > 2^15
+// HORSHEAR_REDUCE_PREC_BITS = 5 => sum = 22395 < 2^15
+//
+// So, as long as HORSHEAR_REDUCE_PREC_BITS >= 5, we can safely use a 16-bit
+// intermediate array.
+void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
+ int height, int stride, uint16_t *pred, int p_col,
+ int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, int bd, int ref_frm,
+ int32_t alpha, int32_t beta, int32_t gamma,
+ int32_t delta) {
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ int16_t tmp[15 * 8];  // 15 source rows (k = -7..7) x 8 columns
+#else
+ int32_t tmp[15 * 8];
+#endif
+ int i, j, k, l, m;
+ /* Warps 'ref' (width x height samples, row pitch 'stride') into 'pred',
+ which holds the p_width x p_height block located at (p_col, p_row), in
+ 8x8 tiles: a horizontal pass into 'tmp', then a vertical pass whose
+ result is passed through clip_pixel_highbd(..., bd).
+ The left/right frame borders must be extended by at least 13 pixels;
+ other code sets this up, and the disabled loop below can check it. */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;  // Model applied at (j + 4, i + 4).
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;  // Integer part of x4.
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);  // Fractional part of x4.
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;  // Source row, clamped to [0, height - 1] below.
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {  // Far left: fill the row from its first pixel.
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else if (ix4 >= width + 6) {  // Far right: fill from the last pixel.
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+ }
+ } else {
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;  // Leftmost of the 8 taps; may index the border.
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
+#if HORSHEAR_REDUCE_PREC_BITS >= 5
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+#else
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+#endif
+ sx += alpha;
+ }
+ }
+ }
+ // Vertical filter. Both loops are clipped to the prediction block, so no
+ // pixel outside the p_width x p_height area of 'pred' is written.
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ uint16_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
+ int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+ sum = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
+ if (ref_frm)  // Non-zero: average with the value already in 'pred'.
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
int height, int stride, uint8_t *pred8, int p_col,
int p_row, int p_width, int p_height,
@@ -858,91 +998,20 @@
wm->wmmat[5] = wm->wmmat[2];
wm->wmmat[4] = -wm->wmmat[3];
}
- if (wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) {
- int32_t tmp[15 * 8];
- int i, j, k, l, m;
+ if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
+ y_scale == 16) {
int32_t *mat = wm->wmmat;
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-
const int32_t alpha = wm->alpha;
const int32_t beta = wm->beta;
const int32_t gamma = wm->gamma;
const int32_t delta = wm->delta;
- for (i = p_row; i < p_row + p_height; i += 8) {
- for (j = p_col; j < p_col + p_width; j += 8) {
- int32_t x4, y4, ix4, sx4, iy4, sy4;
- if (subsampling_x)
- x4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
- if (subsampling_y)
- y4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
- ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Horizontal filter
- for (k = -7; k < 8; ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
-
- for (l = -4; l < 4; ++l) {
- int ix = ix4 + l;
- int sx = ROUND_POWER_OF_TWO_SIGNED(sx4 + alpha * l + beta * k,
- WARPEDDIFF_PREC_BITS);
- const int16_t *coeffs = warped_filter[sx + WARPEDPIXEL_PREC_SHIFTS];
- int32_t sum = 0;
- for (m = 0; m < 8; ++m) {
- if (ix + m - 3 < 0)
- sum += ref[iy * stride] * coeffs[m];
- else if (ix + m - 3 > width - 1)
- sum += ref[iy * stride + width - 1] * coeffs[m];
- else
- sum += ref[iy * stride + ix + m - 3] * coeffs[m];
- }
- sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
- tmp[(k + 7) * 8 + (l + 4)] = sum;
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
- for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
- uint16_t *p =
- &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
- int sy = ROUND_POWER_OF_TWO_SIGNED(sy4 + gamma * l + delta * k,
- WARPEDDIFF_PREC_BITS);
- const int16_t *coeffs = warped_filter[sy + WARPEDPIXEL_PREC_SHIFTS];
- int32_t sum = 0;
- for (m = 0; m < 8; ++m) {
- sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
- }
- sum = clip_pixel_highbd(
- ROUND_POWER_OF_TWO_SIGNED(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
- if (ref_frm)
- *p = ROUND_POWER_OF_TWO_SIGNED(*p + sum, 1);
- else
- *p = sum;
- }
- }
- }
- }
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, bd, ref_frm, alpha, beta, gamma,
+ delta);
} else {
highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
p_width, p_height, p_stride, subsampling_x,
@@ -1048,14 +1117,6 @@
TODO(david.barker): Maybe support scaled references?
*/
-static inline int16_t saturate_int16(int32_t v) {
- if (v > 32767)
- return 32767;
- else if (v < -32768)
- return -32768;
- return v;
-}
-
void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 1d36a3f..4eea6c3 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -16,6 +16,9 @@
using std::tr1::make_tuple;
using libaom_test::ACMRandom;
using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
+#if CONFIG_AOM_HIGHBITDEPTH
+using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
+#endif
namespace {
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index d0966b9..25d1e07 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -17,6 +17,10 @@
using libaom_test::ACMRandom;
using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
using libaom_test::AV1WarpFilter::WarpTestParam;
+#if CONFIG_AOM_HIGHBITDEPTH
+using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
+using libaom_test::AV1HighbdWarpFilter::HighbdWarpTestParam;
+#endif
::testing::internal::ParamGenerator<WarpTestParam>
libaom_test::AV1WarpFilter::GetDefaultParams() {
@@ -42,6 +46,7 @@
if ((rnd_.Rand8()) & 1) return -v;
return v;
}
+
void AV1WarpFilterTest::generate_model(int32_t *mat, int32_t *alpha,
int32_t *beta, int32_t *gamma,
int32_t *delta) {
@@ -73,7 +78,7 @@
(1 << WARPEDMODEL_PREC_BITS);
if ((4 * abs(*alpha) + 7 * abs(*beta) > (1 << WARPEDMODEL_PREC_BITS)) ||
- (4 * abs(*gamma) + 7 * abs(*delta) > (1 << WARPEDMODEL_PREC_BITS)))
+ (4 * abs(*gamma) + 4 * abs(*delta) > (1 << WARPEDMODEL_PREC_BITS)))
continue;
// We have a valid model, so finish
@@ -103,7 +108,6 @@
memset(input + i * stride + w, input[i * stride + (w - 1)], border);
}
- /* Try different sizes of prediction block */
for (i = 0; i < num_iters; ++i) {
for (sub_x = 0; sub_x < 2; ++sub_x)
for (sub_y = 0; sub_y < 2; ++sub_y) {
@@ -121,3 +125,122 @@
}
}
}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+::testing::internal::ParamGenerator<HighbdWarpTestParam>
+libaom_test::AV1HighbdWarpFilter::GetDefaultParams() {
+ const HighbdWarpTestParam defaultParams[] = {  // (out_w, out_h, num_iters, bd)
+ make_tuple(4, 4, 50000, 8), make_tuple(8, 8, 50000, 8),
+ make_tuple(64, 64, 1000, 8), make_tuple(4, 16, 20000, 8),
+ make_tuple(32, 8, 10000, 8), make_tuple(4, 4, 50000, 10),
+ make_tuple(8, 8, 50000, 10), make_tuple(64, 64, 1000, 10),
+ make_tuple(4, 16, 20000, 10), make_tuple(32, 8, 10000, 10),
+ make_tuple(4, 4, 50000, 12), make_tuple(8, 8, 50000, 12),
+ make_tuple(64, 64, 1000, 12), make_tuple(4, 16, 20000, 12),
+ make_tuple(32, 8, 10000, 12),
+ };  // The same five block sizes are repeated for bd = 8, 10 and 12.
+ return ::testing::ValuesIn(defaultParams);
+}
+
+AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {}
+void AV1HighbdWarpFilterTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());  // Re-seed rnd_ for every test.
+}
+// The only teardown step is the call to libaom_test::ClearSystemState().
+void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+
+int32_t AV1HighbdWarpFilterTest::random_param(int bits) {
+ // 1 in 8 chance of generating zero (arbitrarily chosen)
+ if (((rnd_.Rand8()) & 7) == 0) return 0;
+ // Otherwise, generate uniform values in the range
+ // [-(1 << bits), -1] U [1, 1 << bits]
+ int32_t v = 1 + (rnd_.Rand16() & ((1 << bits) - 1));  // NOTE(review): Rand16() presumably yields 16 bits, capping the range when bits > 16 - confirm.
+ if ((rnd_.Rand8()) & 1) return -v;
+ return v;
+}
+
+void AV1HighbdWarpFilterTest::generate_model(int32_t *mat, int32_t *alpha,
+ int32_t *beta, int32_t *gamma,
+ int32_t *delta) {
+ while (1) {  // Rejection sampling: retry until the check below passes.
+ mat[0] = random_param(WARPEDMODEL_PREC_BITS + 6);
+ mat[1] = random_param(WARPEDMODEL_PREC_BITS + 6);
+ mat[2] = (random_param(WARPEDMODEL_PREC_BITS - 3)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ mat[3] = random_param(WARPEDMODEL_PREC_BITS - 3);
+ // 50/50 chance of generating ROTZOOM vs. AFFINE models
+ if (rnd_.Rand8() & 1) {
+ // AFFINE
+ mat[4] = random_param(WARPEDMODEL_PREC_BITS - 3);
+ mat[5] = (random_param(WARPEDMODEL_PREC_BITS - 3)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ } else {  // ROTZOOM: mat[4] and mat[5] are tied to mat[3] and mat[2].
+ mat[4] = -mat[3];
+ mat[5] = mat[2];
+ }
+ // Only mat[0..5] are written; any further entries are left untouched.
+ // Calculate the derived parameters and check that they are suitable
+ // for the warp filter.
+ assert(mat[2] != 0);  // mat[2] is the divisor for gamma and delta.
+
+ *alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS);
+ *beta = mat[3];
+ *gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2];
+ *delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
+ (1 << WARPEDMODEL_PREC_BITS);
+ // Reject models whose weighted shear sums exceed 1 << WARPEDMODEL_PREC_BITS.
+ if ((4 * abs(*alpha) + 7 * abs(*beta) > (1 << WARPEDMODEL_PREC_BITS)) ||
+ (4 * abs(*gamma) + 4 * abs(*delta) > (1 << WARPEDMODEL_PREC_BITS)))
+ continue;
+
+ // We have a valid model, so finish
+ return;
+ }
+}
+
+void AV1HighbdWarpFilterTest::RunCheckOutput(
+ highbd_warp_affine_func test_impl) {
+ // Compares test_impl against av1_highbd_warp_affine_c on random models.
+ const int w = 128, h = 128, border = 16;  // The filter needs border >= 13.
+ const int stride = w + 2 * border;
+ const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+ const int num_iters = GET_PARAM(2), bd = GET_PARAM(3);
+ const int mask = (1 << bd) - 1;
+ // Rows padded to 8 pixels: a 4-wide block may be stored as full 8-pixel rows.
+ const int output_n = ((out_w + 7) & ~7) * out_h;
+ int i, j, sub_x, sub_y;
+
+ uint16_t *input_ = new uint16_t[h * stride];
+ uint16_t *input = input_ + border;
+ uint16_t *output = new uint16_t[output_n];
+ uint16_t *output2 = new uint16_t[output_n];
+ int32_t mat[8], alpha, beta, gamma, delta;
+
+ // Generate an input block and extend its borders horizontally
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) input[i * stride + j] = rnd_.Rand16() & mask;
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < border; ++j) {
+ input[i * stride - border + j] = input[i * stride];
+ input[i * stride + w + j] = input[i * stride + (w - 1)];
+ }
+ }
+ for (i = 0; i < num_iters; ++i) {
+ for (sub_x = 0; sub_x < 2; ++sub_x)
+ for (sub_y = 0; sub_y < 2; ++sub_y) {
+ generate_model(mat, &alpha, &beta, &gamma, &delta);
+ av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
+ out_w, out_h, out_w, sub_x, sub_y, bd, 0,
+ alpha, beta, gamma, delta);
+ test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
+ out_w, sub_x, sub_y, bd, 0, alpha, beta, gamma, delta);
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = (" << (j % out_w)
+ << ", " << (j / out_w) << ") on iteration " << i;
+ }
+ }
+ delete[] input_;  // Previously all three buffers leaked on every run.
+ delete[] output;
+ delete[] output2;
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h
index 69dd14b..facd8cb 100644
--- a/test/warp_filter_test_util.h
+++ b/test/warp_filter_test_util.h
@@ -56,6 +56,39 @@
} // namespace AV1WarpFilter
+#if CONFIG_AOM_HIGHBITDEPTH
+namespace AV1HighbdWarpFilter {
+typedef void (*highbd_warp_affine_func)(
+ int32_t *mat, uint16_t *ref, int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y, int bd, int ref_frm,
+ int32_t alpha, int32_t beta, int32_t gamma, int32_t delta);  // Same argument list as av1_highbd_warp_affine_c.
+// Test parameters, read by RunCheckOutput() as (out_w, out_h, num_iters, bd).
+typedef std::tr1::tuple<int, int, int, int> HighbdWarpTestParam;
+
+::testing::internal::ParamGenerator<HighbdWarpTestParam> GetDefaultParams();
+
+class AV1HighbdWarpFilterTest
+ : public ::testing::TestWithParam<HighbdWarpTestParam> {
+ public:
+ virtual ~AV1HighbdWarpFilterTest();
+ virtual void SetUp();  // Re-seeds rnd_.
+
+ virtual void TearDown();
+
+ protected:
+ int32_t random_param(int bits);  // Returns 0 or a value with 1 <= |v| <= 1 << bits.
+ void generate_model(int32_t *mat, int32_t *alpha, int32_t *beta,
+ int32_t *gamma, int32_t *delta);  // Fills mat[0..5] and the four shears.
+ // Runs test_impl and av1_highbd_warp_affine_c on the same inputs and compares.
+ void RunCheckOutput(highbd_warp_affine_func test_impl);
+
+ libaom_test::ACMRandom rnd_;  // Random source for the models and input pixels.
+};
+
+} // namespace AV1HighbdWarpFilter
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
} // namespace libaom_test
#endif // TEST_WARP_FILTER_TEST_UTIL_H_