Reduce prec of matrices/vectors for warp estimate

Reduces the precision of the least-squares matrices and vectors
(A, Bx, By) by up to 2 bits, depending on the number of samples.

No material change in performance.

Change-Id: I549a27da1dcb381fb329c345ee280dbd86b45bac
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 11fdd7b..c74609d 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -1425,6 +1425,25 @@
 #define LS_MV_MAX 256  // max mv in 1/8-pel
 #define LS_STEP 2
 
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+//   1 [for sign] +
+//   LEAST_SQUARES_SAMPLES_MAX_BITS
+//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// The value is 23
+#define LS_MAT_RANGE_BITS \
+  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full range
+#define LS_MAT_DOWN_BITS 2
+
+// Bit range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
 #define LS_SUM(a) ((a)*4 + LS_STEP * 2)
 #define LS_SQUARE(a) \
   (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
@@ -1489,9 +1508,32 @@
       n++;
     }
   }
-  int64_t Px[2], Py[2];
-  int64_t iDet, Det, v;
-  int16_t shift;
+  int downshift;
+  if (n >= 4)
+    downshift = LS_MAT_DOWN_BITS;
+  else if (n >= 2)
+    downshift = LS_MAT_DOWN_BITS - 1;
+  else
+    downshift = LS_MAT_DOWN_BITS - 2;
+
+  // Reduce precision by downshift bits
+  A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
+                  LS_MAT_MAX);
+  A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
+                  LS_MAT_MAX);
+  A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
+                  LS_MAT_MAX);
+  Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
+                LS_MAT_MAX);
+  Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
+                LS_MAT_MAX);
+  By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
+                LS_MAT_MAX);
+  By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
+                LS_MAT_MAX);
+
+  int64_t Px[2], Py[2], Det;
+  int16_t iDet, shift;
 
   // These divided by the Det, are the least squares solutions
   Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
@@ -1509,16 +1551,17 @@
     shift = 0;
   }
 
-  v = Px[0] * iDet;
+  int64_t v;
+  v = Px[0] * (int64_t)iDet;
   wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
-  v = Px[1] * iDet;
+  v = Px[1] * (int64_t)iDet;
   wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
   v = (dux << WARPEDMODEL_PREC_BITS) - sux * wm->wmmat[2] - suy * wm->wmmat[3];
   wm->wmmat[0] = ROUND_POWER_OF_TWO_SIGNED(v, 3);
 
-  v = Py[0] * iDet;
+  v = Py[0] * (int64_t)iDet;
   wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
-  v = Py[1] * iDet;
+  v = Py[1] * (int64_t)iDet;
   wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, shift);
   v = (duy << WARPEDMODEL_PREC_BITS) - sux * wm->wmmat[4] - suy * wm->wmmat[5];
   wm->wmmat[1] = ROUND_POWER_OF_TWO_SIGNED(v, 3);
@@ -1631,7 +1674,7 @@
   C12 = (int64_t)A[0][1] * A[0][2] - (int64_t)A[0][0] * A[1][2];
   C22 = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
 
-  // Scale by 1/16
+  // Scale by 1/64
   C00 = ROUND_POWER_OF_TWO_SIGNED(C00, 6);
   C01 = ROUND_POWER_OF_TWO_SIGNED(C01, 6);
   C02 = ROUND_POWER_OF_TWO_SIGNED(C02, 6);