Replace division in warped motion least squares

Replaces the int64 and int32 divisions in least-squares and gamma/delta
computation with a mechanism that decomposes the divisor D such that
1/D = y * 2^-k, where y is obtained from a lookup table indexed by the 8
highest bits of the difference D - 2^floor(log2(D)). The main complexity
is now only from computing this decomposition, which is essentially
equivalent to finding floor(log2(D)) (the position of the highest set bit
in a 64-bit integer).

Also includes a fix for an out-of-bounds read of the warped_filter table
and some cleanups.

Change-Id: I9247fdff5f6b4191175d4b4656357bfff626f02c
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c index a84c3c9..a25e9e6 100644 --- a/av1/common/warped_motion.c +++ b/av1/common/warped_motion.c
@@ -479,7 +479,7 @@ // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. // We need an extra 2 taps to fit this in, for a total of 8 taps. /* clang-format off */ -const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3][8] = { +const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { // [-1, 0) { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, - 1, 128, 2, - 1, 0, 0, 0 }, { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, @@ -581,9 +581,122 @@ { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 }, { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 }, { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, - 1, 2, 128, - 1, 0 }, + + // dummy + { 0, 0, 0, 0, 0, 128, 0, 0 }, }; /* clang-format on */ +// Whether to use approximate divisor +#define APPROXIMATE_DIVISOR 1 + +#if APPROXIMATE_DIVISOR +#define DIV_LUT_PREC_BITS 14 +#define DIV_LUT_BITS 8 +#define DIV_LUT_NUM (1 << DIV_LUT_BITS) + +static const uint16_t div_lut[DIV_LUT_NUM + 1] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 
10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192, +}; + +#if CONFIG_WARPED_MOTION +// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned +// at precision of DIV_LUT_PREC_BITS along with the shift. +static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { + int64_t e, f; + *shift = (D >> 32) ? get_msb(D >> 32) + 32 : get_msb(D); + // e is obtained from D after resetting the most significant 1 bit. + e = D - ((uint64_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} +#endif // CONFIG_WARPED_MOTION + +static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { + int32_t e, f; + *shift = get_msb(D); + // e is obtained from D after resetting the most significant 1 bit. 
+ e = D - ((uint32_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} +#endif // APPROXIMATE_DIVISOR + +static int is_affine_valid(WarpedMotionParams *wm) { + const int32_t *mat = wm->wmmat; + return (mat[2] > 0); +} + +static int is_affine_shear_allowed(int32_t alpha, int32_t beta, int32_t gamma, + int32_t delta) { + if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) + return 0; + else + return 1; +} + +// Returns 1 on success or 0 on an invalid affine set +static int get_shear_params(WarpedMotionParams *wm, int32_t *alpha, + int32_t *beta, int32_t *gamma, int32_t *delta) { + const int32_t *mat = wm->wmmat; + if (!is_affine_valid(wm)) return 0; + *alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS); + *beta = mat[3]; +#if APPROXIMATE_DIVISOR + int16_t shift; + int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? 
-1 : 1); + int64_t v; + v = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) * y; + *gamma = ROUND_POWER_OF_TWO_SIGNED_64(v, shift); + v = ((int64_t)mat[3] * mat[4]) * y; + *delta = mat[5] - ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - + (1 << WARPEDMODEL_PREC_BITS); +#else + *gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2]; + *delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) - + (1 << WARPEDMODEL_PREC_BITS); +#endif // APPROXIMATE_DIVISOR + return 1; +} + #if CONFIG_AOM_HIGHBITDEPTH static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col, int stride, int x, int y_start) { @@ -760,31 +873,15 @@ int i, j, k, l, m; int32_t *mat = wm->wmmat; int32_t alpha, beta, gamma, delta; - - if (mat[2] == 0) { - // assert(0 && - // "Warped motion model is incompatible with new warp filter"); - highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, - p_row, p_width, p_height, p_stride, subsampling_x, - subsampling_y, x_scale, y_scale, bd, ref_frm); - return; - } - - alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS); - beta = mat[3]; - gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2]; - delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) - - (1 << WARPEDMODEL_PREC_BITS); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - if ((4 * abs(alpha) + 7 * abs(beta) > (1 << WARPEDMODEL_PREC_BITS)) || - (4 * abs(gamma) + 4 * abs(delta) > (1 << WARPEDMODEL_PREC_BITS))) { - // assert(0 && - // "Warped motion model is incompatible with new warp filter"); - highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, - p_row, p_width, p_height, p_stride, subsampling_x, - subsampling_y, x_scale, y_scale, bd, ref_frm); + if (!get_shear_params(wm, &alpha, &beta, &gamma, &delta)) { + assert(0 && "Warped motion model is incompatible with shear warp filter"); + return; + } + if (!is_affine_shear_allowed(alpha, beta, gamma, delta)) { + assert(0 && "Warped motion model is 
incompatible with shear warp filter"); return; } @@ -1058,10 +1155,11 @@ for (l = -4; l < 4; ++l) { int ix = ix4 + l - 3; // At this point, sx = sx4 + alpha * l + beta * k - const int16_t *coeffs = - warped_filter[ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + - WARPEDPIXEL_PREC_SHIFTS]; + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + const int16_t *coeffs = warped_filter[offs]; int32_t sum = 0; + // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); for (m = 0; m < 8; ++m) { sum += ref[iy * stride + ix + m] * coeffs[m]; } @@ -1078,10 +1176,11 @@ uint8_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; // At this point, sy = sy4 + gamma * l + delta * k - const int16_t *coeffs = - warped_filter[ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + - WARPEDPIXEL_PREC_SHIFTS]; + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + const int16_t *coeffs = warped_filter[offs]; int32_t sum = 0; + // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); for (m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } @@ -1112,31 +1211,16 @@ int32_t *mat = wm->wmmat; int32_t alpha, beta, gamma, delta; - if (mat[2] == 0) { - // assert(0 && - // "Warped motion model is incompatible with new warp filter"); - warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, subsampling_y, - x_scale, y_scale, ref_frm); + if (!get_shear_params(wm, &alpha, &beta, &gamma, &delta)) { + assert(0 && "Warped motion model is incompatible with shear warp filter"); + return; + } + if (!is_affine_shear_allowed(alpha, beta, gamma, delta)) { + assert(0 && "Warped motion model is incompatible with shear warp filter"); return; } - alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS); - beta = mat[3]; - gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2]; - delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) 
- - (1 << WARPEDMODEL_PREC_BITS); - - if ((4 * abs(alpha) + 7 * abs(beta) > (1 << WARPEDMODEL_PREC_BITS)) || - (4 * abs(gamma) + 4 * abs(delta) > (1 << WARPEDMODEL_PREC_BITS))) { - // assert(0 && - // "Warped motion model is incompatible with new warp filter"); - warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, - p_width, p_height, p_stride, subsampling_x, subsampling_y, - x_scale, y_scale, ref_frm); - return; - } - + // printf("%d %d %d %d\n", mat[2], mat[3], mat[4], mat[5]); av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, ref_frm, alpha, beta, gamma, delta); @@ -1215,10 +1299,13 @@ #if CONFIG_WARPED_MOTION -#define IDET_PREC_BITS 48 #define LEAST_SQUARES_SAMPLES_MAX 32 #define LEAST_SQUARES_MV_MAX 1024 // max mv in 1/8-pel + +#ifndef APPROXIMATE_DIVISOR +#define IDET_PREC_BITS 48 #define IDET_WARPEDMODEL_DIFF_BITS (IDET_PREC_BITS - WARPEDMODEL_PREC_BITS) +#endif // APPROXIMATE_DIVISOR static int find_affine_int(const int np, int *pts1, int *pts2, WarpedMotionParams *wm, int mi_row, int mi_col) { @@ -1229,7 +1316,7 @@ int64_t C00, C01, C02, C11, C12, C22; int64_t Px[3], Py[3]; - int64_t Det, iDet, v; + int64_t Det, v; // Offsets to make the values in the arrays smaller const int ux = mi_col * MI_SIZE * 8, uy = mi_row * MI_SIZE * 8; @@ -1305,32 +1392,42 @@ Py[1] = C01 * By[0] + C11 * By[1] + C12 * By[2]; Py[2] = C02 * By[0] + C12 * By[1] + C22 * By[2]; + int16_t shift; + int64_t iDet; +#if APPROXIMATE_DIVISOR + iDet = resolve_divisor_64(labs(Det), &shift) * (Det < 0 ? 
-1 : 1); + if (shift > WARPEDMODEL_PREC_BITS) { + shift -= WARPEDMODEL_PREC_BITS; + } else { + iDet <<= WARPEDMODEL_PREC_BITS; + } +#else // Compute inverse of the Determinant - // TODO(debargha, yuec): Try to remove this only division if possible iDet = ((int64_t)1 << IDET_PREC_BITS) / Det; + shift = IDET_WARPEDMODEL_DIFF_BITS; +#endif // APPROXIMATE_DIVISOR v = Px[0] * iDet; - wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS); + wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift); v = Px[1] * iDet; - wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS); + wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift); v = Px[2] * iDet; - wm->wmmat[0] = - ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3); + wm->wmmat[0] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift + 3); // Adjust x displacement for the offset off = (ux << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[2] - uy * wm->wmmat[3]; wm->wmmat[0] += ROUND_POWER_OF_TWO_SIGNED(off, 3); v = Py[0] * iDet; - wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS); + wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift); v = Py[1] * iDet; - wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS); + wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift); v = Py[2] * iDet; - wm->wmmat[1] = - ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3); + wm->wmmat[1] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift + 3); // Adjust y displacement for the offset off = (uy << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[4] - uy * wm->wmmat[5]; wm->wmmat[1] += ROUND_POWER_OF_TWO_SIGNED(off, 3); wm->wmmat[6] = wm->wmmat[7] = 0; + return 0; } @@ -1350,21 +1447,9 @@ } if (wm_params->wmtype == AFFINE || wm_params->wmtype == ROTZOOM) { // check compatibility with the fast warp filter - int32_t *mat = wm_params->wmmat; int32_t alpha, beta, gamma, delta; - - if (mat[2] == 0) return 1; - - alpha = mat[2] - (1 << WARPEDMODEL_PREC_BITS); - beta = mat[3]; - 
gamma = ((int64_t)mat[4] << WARPEDMODEL_PREC_BITS) / mat[2]; - delta = mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) - - (1 << WARPEDMODEL_PREC_BITS); - - if ((4 * abs(alpha) + 7 * abs(beta) > (1 << WARPEDMODEL_PREC_BITS)) || - (4 * abs(gamma) + 4 * abs(delta) > (1 << WARPEDMODEL_PREC_BITS))) { - return 1; - } + if (!get_shear_params(wm_params, &alpha, &beta, &gamma, &delta)) return 1; + if (!is_affine_shear_allowed(alpha, beta, gamma, delta)) return 1; } }
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h index cc3d019..4ac7f2c 100644 --- a/av1/common/warped_motion.h +++ b/av1/common/warped_motion.h
@@ -30,7 +30,7 @@ #define DEFAULT_WMTYPE AFFINE #endif // CONFIG_WARPED_MOTION -const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3][8]; +const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj, const int n, const int stride_points,