Reduce multiplier precision for warp least squares

Includes reordering and other clamping changes, as well as
changes to reduce the multiplier precision.

cam_lowres (60 frames): -0.092% BDRATE improvement in the
--disable-cdef --disable-global-motion --disable-ext-tx
configuration.

Change-Id: I0660c45b44fcd5a193534d8dadd1aa1ae5c5e27a
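
As a standalone illustration of the idea (not part of the patch): the
limited-precision path factors each 64-bit least-squares product into a
16-bit mantissa plus a shift, so that the later multiply by the inverse
determinant fits in 32 bits. The names below (msb64, approx64,
MANT_BITS) are hypothetical stand-ins for the patch's
resolve_multiplier_64() and MUL_PREC_BITS; unlike the patch, this
sketch renormalizes the rare rounding carry out of the mantissa.

    #include <stdint.h>
    #include <stdio.h>

    #define MANT_BITS 16 /* plays the role of MUL_PREC_BITS */

    /* Index of the most significant set bit of v; v must be nonzero. */
    static int msb64(uint64_t v) {
      int n = 0;
      while (v >>= 1) ++n;
      return n;
    }

    /* Factor D into approximately (mant << shift), mant < 2^MANT_BITS,
     * rounding the discarded low bits to nearest. */
    static uint16_t approx64(uint64_t D, int *shift) {
      *shift = 0;
      if (D == 0) return 0;
      const int msb = msb64(D);
      if (msb < MANT_BITS) return (uint16_t)D;
      *shift = msb + 1 - MANT_BITS;
      uint64_t mant = (D + ((uint64_t)1 << (*shift - 1))) >> *shift;
      if (mant >> MANT_BITS) { /* rounding carried past the top bit */
        mant >>= 1;
        ++*shift;
      }
      return (uint16_t)mant;
    }

    int main(void) {
      int shift;
      const uint64_t D = 0x123456789ULL;
      const uint16_t mant = approx64(D, &shift);
      /* The absolute factorization error is at most 2^(shift - 1). */
      printf("%llu ~= %u << %d = %llu\n", (unsigned long long)D, mant,
             shift, (unsigned long long)((uint64_t)mant << shift));
      return 0;
    }
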
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index 7f12ee8..3807ae0 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -80,6 +80,10 @@
return value < low ? low : (value > high ? high : value);
}
+static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
static INLINE double fclamp(double value, double low, double high) {
return value < low ? low : (value > high ? high : value);
}
diff --git a/av1/common/mv.h b/av1/common/mv.h
index 7fff182..9598319 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -44,9 +44,8 @@
#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
-#define WARPEDMODEL_DIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS + 1))
-#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 1))
-#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 1))
+#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
+#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 2))
// Bits of subpel precision for warped interpolation
#define WARPEDPIXEL_PREC_BITS 6
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 8393288..637648a 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -1747,6 +1747,81 @@
#define LS_PRODUCT2(a, b) \
(((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+
+#define MUL_PREC_BITS 16
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+ int msb = 0;
+ uint16_t mult = 0;
+ *shift = 0;
+ if (D != 0) {
+    msb = (D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+                    : get_msb((unsigned int)D);
+ if (msb >= MUL_PREC_BITS) {
+ mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
+ *shift = msb + 1 - MUL_PREC_BITS;
+ } else {
+ mult = (uint16_t)D;
+ *shift = 0;
+ }
+ }
+ return mult;
+}
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+ int16_t mshift;
+ uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+ int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+ shift -= mshift;
+ if (shift > 0) {
+ return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+ -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ } else {
+ return (int32_t)clamp(v * (1 << (-shift)),
+ -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ }
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+ int16_t mshift;
+ uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+ int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+ shift -= mshift;
+ if (shift > 0) {
+ return (int32_t)clamp(
+ ROUND_POWER_OF_TWO_SIGNED(v, shift),
+ (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ } else {
+ return (int32_t)clamp(
+ v * (1 << (-shift)),
+ (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ }
+}
+
+#else
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+ int64_t v = Px * (int64_t)iDet;
+ return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+ -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+ int64_t v = Px * (int64_t)iDet;
+ return (int32_t)clamp64(
+ ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+ (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif // USE_LIMITED_PREC_MULT
+
static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
int mi_col) {
@@ -1757,8 +1832,10 @@
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
- const int suy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1) * 8;
- const int sux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1) * 8;
+ const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1);
+ const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1);
+ const int suy = isuy * 8;
+ const int sux = isux * 8;
const int duy = suy + mvy;
const int dux = sux + mvx;
@@ -1845,38 +1920,27 @@
shift = 0;
}
- int64_t v;
- v = Px[0] * (int64_t)iDet;
- wm->wmmat[2] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
- v = Px[1] * (int64_t)iDet;
- wm->wmmat[3] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
- v = ((int64_t)dux * (1 << WARPEDMODEL_PREC_BITS)) -
- (int64_t)sux * wm->wmmat[2] - (int64_t)suy * wm->wmmat[3];
- wm->wmmat[0] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
+ wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+ wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+ wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+ wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
- v = Py[0] * (int64_t)iDet;
- wm->wmmat[4] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
- v = Py[1] * (int64_t)iDet;
- wm->wmmat[5] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
- v = ((int64_t)duy * (1 << WARPEDMODEL_PREC_BITS)) -
- (int64_t)sux * wm->wmmat[4] - (int64_t)suy * wm->wmmat[5];
- wm->wmmat[1] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
+  // Note: In the vx, vy expressions below, the maximum value of each of
+  // the 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough
+  // room for the first term, so the worst-case sum is guaranteed to fit
+  // within 32 bits.
+ int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+ (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+ isuy * wm->wmmat[3]);
+ int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+ (isux * wm->wmmat[4] +
+ isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+ wm->wmmat[0] =
+ clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+ wm->wmmat[1] =
+ clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
wm->wmmat[6] = wm->wmmat[7] = 0;
-
- // Clamp values
- wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP,
- WARPEDMODEL_TRANS_CLAMP - 1);
- wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP,
- WARPEDMODEL_TRANS_CLAMP - 1);
- wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP,
- WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
- wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP,
- WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
- wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
- WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
- wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
- WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
return 0;
}
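
As a quick standalone check of the 32-bit headroom argument in the
comment added above (illustrative only; the bounds on isux/isuy and on
the motion vector are assumptions read off the comment, not derived
from the bitstream spec):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const int64_t coord_max = (1 << 16) - 1; /* assumed |isux|, |isuy| */
      const int64_t param_max = (1 << 13) - 1; /* clamped wmmat offsets */
      const int64_t mv_max = (1 << 13) - 1;    /* assumed |mvx|, |mvy| */
      /* First term: mvx * 2^(WARPEDMODEL_PREC_BITS - 3) = mvx * 2^13.
       * 2nd/3rd terms: coordinate times clamped parameter offset. */
      const int64_t worst = mv_max * (1 << 13) + 2 * coord_max * param_max;
      assert(worst < ((int64_t)1 << 31)); /* ~2^26 + ~2^30 < 2^31 */
      return 0;
    }
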