Replace division in warped motion least squares
Replaces the int64 and int32 divisions in least-squares and
gamma or delta computation with a mechanism that decomposes
the divisor D such that 1/D = y * 2^-k where y is obtained
from a lookup table indexed by 8 highest bits of the difference
D - 2^floor(log2(D)). The main complexity is now only from
computing this decomposition, which is essentially equivalent
to finding floor(log2(D)) (position of highest
bit in a 64-bit integer).
Also includes an out of memory bug fix and some cleanups.
Change-Id: I9247fdff5f6b4191175d4b4656357bfff626f02c
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index a84c3c9..a25e9e6 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -479,7 +479,7 @@
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
/* clang-format off */
-const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3][8] = {
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
// [-1, 0)
{ 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, - 1, 128, 2, - 1, 0, 0, 0 },
{ 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
@@ -581,9 +581,122 @@
{ 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
{ 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
{ 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, - 1, 2, 128, - 1, 0 },
+
+ // dummy
+ { 0, 0, 0, 0, 0, 128, 0, 0 },
};
/* clang-format on */
+// Whether to use approximate divisor
+#define APPROXIMATE_DIVISOR 1
+
+#if APPROXIMATE_DIVISOR
+#define DIV_LUT_PREC_BITS 14
+#define DIV_LUT_BITS 8
+#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
+
+static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+#if CONFIG_WARPED_MOTION
+// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
+// at precision of DIV_LUT_PREC_BITS along with the shift.
+static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
+ int64_t e, f;
+ *shift = (D >> 32) ? get_msb(D >> 32) + 32 : get_msb(D);
+ // e is obtained from D after resetting the most significant 1 bit.
+ e = D - ((uint64_t)1 << *shift);
+ // Get the most significant DIV_LUT_BITS (8) bits of e into f
+ if (*shift > DIV_LUT_BITS)
+ f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
+ else
+ f = e << (DIV_LUT_BITS - *shift);
+ assert(f <= DIV_LUT_NUM);
+ *shift += DIV_LUT_PREC_BITS;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+#endif // CONFIG_WARPED_MOTION
+
+static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
+ int32_t e, f;
+ *shift = get_msb(D);
+ // e is obtained from D after resetting the most significant 1 bit.
+ e = D - ((uint32_t)1 << *shift);
+ // Get the most significant DIV_LUT_BITS (8) bits of e into f
+ if (*shift > DIV_LUT_BITS)
+ f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
+ else
+ f = e << (DIV_LUT_BITS - *shift);
+ assert(f <= DIV_LUT_NUM);
+ *shift += DIV_LUT_PREC_BITS;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+#endif // APPROXIMATE_DIVISOR
+
+static int is_affine_valid(WarpedMotionParams *wm) {
+ const int32_t *mat = wm->wmmat;
+ return (mat[2] > 0);
+}
+
+static int is_affine_shear_allowed(int32_t alpha, int32_t beta, int32_t gamma,
+ int32_t delta) {
+ if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
+ (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
+ return 0;
+ else
+ return 1;
+}
+
+// Returns 1 on success or 0 on an invalid affine set
+static int get_shear_params(WarpedMotionParams *wm, int32_t *alpha,
+ int32_t *beta, int32_t *gamma, int32_t *delta) {
+ const int32_t *mat = wm->wmmat;
+ if (!is_affine_valid(wm)) return 0;
+ *alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS);
+ *beta = mat[3];
+#if APPROXIMATE_DIVISOR
+ int16_t shift;
+ int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
+ int64_t v;
+ v = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) * y;
+ *gamma = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
+ v = ((int64_t)mat[3] * mat[4]) * y;
+ *delta = mat[5] - ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
+ (1 << WARPEDMODEL_PREC_BITS);
+#else
+ *gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2];
+ *delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
+ (1 << WARPEDMODEL_PREC_BITS);
+#endif // APPROXIMATE_DIVISOR
+ return 1;
+}
+
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col,
int stride, int x, int y_start) {
@@ -760,31 +873,15 @@
int i, j, k, l, m;
int32_t *mat = wm->wmmat;
int32_t alpha, beta, gamma, delta;
-
- if (mat[2] == 0) {
- // assert(0 &&
- // "Warped motion model is incompatible with new warp filter");
- highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col,
- p_row, p_width, p_height, p_stride, subsampling_x,
- subsampling_y, x_scale, y_scale, bd, ref_frm);
- return;
- }
-
- alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS);
- beta = mat[3];
- gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2];
- delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
- (1 << WARPEDMODEL_PREC_BITS);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- if ((4 * abs(alpha) + 7 * abs(beta) > (1 << WARPEDMODEL_PREC_BITS)) ||
- (4 * abs(gamma) + 4 * abs(delta) > (1 << WARPEDMODEL_PREC_BITS))) {
- // assert(0 &&
- // "Warped motion model is incompatible with new warp filter");
- highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col,
- p_row, p_width, p_height, p_stride, subsampling_x,
- subsampling_y, x_scale, y_scale, bd, ref_frm);
+ if (!get_shear_params(wm, &alpha, &beta, &gamma, &delta)) {
+ assert(0 && "Warped motion model is incompatible with shear warp filter");
+ return;
+ }
+ if (!is_affine_shear_allowed(alpha, beta, gamma, delta)) {
+ assert(0 && "Warped motion model is incompatible with shear warp filter");
return;
}
@@ -1058,10 +1155,11 @@
for (l = -4; l < 4; ++l) {
int ix = ix4 + l - 3;
// At this point, sx = sx4 + alpha * l + beta * k
- const int16_t *coeffs =
- warped_filter[ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
- WARPEDPIXEL_PREC_SHIFTS];
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
sum += ref[iy * stride + ix + m] * coeffs[m];
}
@@ -1078,10 +1176,11 @@
uint8_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
// At this point, sy = sy4 + gamma * l + delta * k
- const int16_t *coeffs =
- warped_filter[ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
- WARPEDPIXEL_PREC_SHIFTS];
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
+ // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
@@ -1112,31 +1211,16 @@
int32_t *mat = wm->wmmat;
int32_t alpha, beta, gamma, delta;
- if (mat[2] == 0) {
- // assert(0 &&
- // "Warped motion model is incompatible with new warp filter");
- warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x, subsampling_y,
- x_scale, y_scale, ref_frm);
+ if (!get_shear_params(wm, &alpha, &beta, &gamma, &delta)) {
+ assert(0 && "Warped motion model is incompatible with shear warp filter");
+ return;
+ }
+ if (!is_affine_shear_allowed(alpha, beta, gamma, delta)) {
+ assert(0 && "Warped motion model is incompatible with shear warp filter");
return;
}
- alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS);
- beta = mat[3];
- gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2];
- delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
- (1 << WARPEDMODEL_PREC_BITS);
-
- if ((4 * abs(alpha) + 7 * abs(beta) > (1 << WARPEDMODEL_PREC_BITS)) ||
- (4 * abs(gamma) + 4 * abs(delta) > (1 << WARPEDMODEL_PREC_BITS))) {
- // assert(0 &&
- // "Warped motion model is incompatible with new warp filter");
- warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x, subsampling_y,
- x_scale, y_scale, ref_frm);
- return;
- }
-
+ // printf("%d %d %d %d\n", mat[2], mat[3], mat[4], mat[5]);
av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
p_width, p_height, p_stride, subsampling_x, subsampling_y,
ref_frm, alpha, beta, gamma, delta);
@@ -1215,10 +1299,13 @@
#if CONFIG_WARPED_MOTION
-#define IDET_PREC_BITS 48
#define LEAST_SQUARES_SAMPLES_MAX 32
#define LEAST_SQUARES_MV_MAX 1024 // max mv in 1/8-pel
+
+#ifndef APPROXIMATE_DIVISOR
+#define IDET_PREC_BITS 48
#define IDET_WARPEDMODEL_DIFF_BITS (IDET_PREC_BITS - WARPEDMODEL_PREC_BITS)
+#endif // APPROXIMATE_DIVISOR
static int find_affine_int(const int np, int *pts1, int *pts2,
WarpedMotionParams *wm, int mi_row, int mi_col) {
@@ -1229,7 +1316,7 @@
int64_t C00, C01, C02, C11, C12, C22;
int64_t Px[3], Py[3];
- int64_t Det, iDet, v;
+ int64_t Det, v;
// Offsets to make the values in the arrays smaller
const int ux = mi_col * MI_SIZE * 8, uy = mi_row * MI_SIZE * 8;
@@ -1305,32 +1392,42 @@
Py[1] = C01 * By[0] + C11 * By[1] + C12 * By[2];
Py[2] = C02 * By[0] + C12 * By[1] + C22 * By[2];
+ int16_t shift;
+ int64_t iDet;
+#if APPROXIMATE_DIVISOR
+ iDet = resolve_divisor_64(labs(Det), &shift) * (Det < 0 ? -1 : 1);
+ if (shift > WARPEDMODEL_PREC_BITS) {
+ shift -= WARPEDMODEL_PREC_BITS;
+ } else {
+ iDet <<= WARPEDMODEL_PREC_BITS;
+ }
+#else
// Compute inverse of the Determinant
- // TODO(debargha, yuec): Try to remove this only division if possible
iDet = ((int64_t)1 << IDET_PREC_BITS) / Det;
+ shift = IDET_WARPEDMODEL_DIFF_BITS;
+#endif // APPROXIMATE_DIVISOR
v = Px[0] * iDet;
- wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+ wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
v = Px[1] * iDet;
- wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+ wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
v = Px[2] * iDet;
- wm->wmmat[0] =
- ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3);
+ wm->wmmat[0] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift + 3);
// Adjust x displacement for the offset
off = (ux << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[2] - uy * wm->wmmat[3];
wm->wmmat[0] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
v = Py[0] * iDet;
- wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+ wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
v = Py[1] * iDet;
- wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+ wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
v = Py[2] * iDet;
- wm->wmmat[1] =
- ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3);
+ wm->wmmat[1] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift + 3);
// Adjust y displacement for the offset
off = (uy << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[4] - uy * wm->wmmat[5];
wm->wmmat[1] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
wm->wmmat[6] = wm->wmmat[7] = 0;
+
return 0;
}
@@ -1350,21 +1447,9 @@
}
if (wm_params->wmtype == AFFINE || wm_params->wmtype == ROTZOOM) {
// check compatibility with the fast warp filter
- int32_t *mat = wm_params->wmmat;
int32_t alpha, beta, gamma, delta;
-
- if (mat[2] == 0) return 1;
-
- alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS);
- beta = mat[3];
- gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2];
- delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
- (1 << WARPEDMODEL_PREC_BITS);
-
- if ((4 * abs(alpha) + 7 * abs(beta) > (1 << WARPEDMODEL_PREC_BITS)) ||
- (4 * abs(gamma) + 4 * abs(delta) > (1 << WARPEDMODEL_PREC_BITS))) {
- return 1;
- }
+ if (!get_shear_params(wm_params, &alpha, &beta, &gamma, &delta)) return 1;
+ if (!is_affine_shear_allowed(alpha, beta, gamma, delta)) return 1;
}
}