Some optimizations on integer affine estimation

1. Adds a limit on number of candidate samples used for the
estimation.
2. Adds a limit on max mv magnitude for use in the least-squares
3. Makes some of the internal variables 32-bit.

Impact on coding efficiency in the noise range.

Change-Id: I8c1c3216368ceb2e3548660a3b8c159df54a8312
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index 4789d29..7316399 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -46,6 +46,14 @@
   (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                  : ROUND_POWER_OF_TWO((value), (n)))
 
+/* Shift down with rounding for use when n >= 0, value >= 0 for (64 bit) */
+#define ROUND_POWER_OF_TWO_64(value, n) \
+  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
+/* Shift down with rounding for signed integers, for use when n >= 0 (64 bit) */
+#define ROUND_POWER_OF_TWO_SIGNED_64(value, n)           \
+  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
+                 : ROUND_POWER_OF_TWO_64((value), (n)))
+
 #define ALIGN_POWER_OF_TWO(value, n) \
   (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index e8079e5..5088ad0 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -1150,7 +1150,6 @@
   int mvnumber = 0;
   int global_offset_c = mi_col * MI_SIZE;
   int global_offset_r = mi_row * MI_SIZE;
-  int samples_per_neighbor = 4;
 
   // scan the above row
   if (up_available) {
@@ -1169,7 +1168,7 @@
         int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
         int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1;
         int j;
-        int pixelperblock = samples_per_neighbor;
+        int pixelperblock = SAMPLES_PER_NEIGHBOR;
 
         mvasint[mvnumber] = mbmi->mv[0].as_int;
         mvnumber++;
@@ -1212,7 +1211,7 @@
         int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1;
         int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
         int j;
-        int pixelperblock = samples_per_neighbor;
+        int pixelperblock = SAMPLES_PER_NEIGHBOR;
 
         mvasint[mvnumber] = mbmi->mv[0].as_int;
         mvnumber++;
@@ -1251,7 +1250,7 @@
       int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
       int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
       int j;
-      int pixelperblock = samples_per_neighbor;
+      int pixelperblock = SAMPLES_PER_NEIGHBOR;
 
       mvasint[mvnumber] = mbmi->mv[0].as_int;
       mvnumber++;
@@ -1292,7 +1291,7 @@
     int cr_offset = AOMMAX(bh, MI_SIZE) / 2 - 1;
     int cc_offset = AOMMAX(bw, MI_SIZE) / 2 - 1;
     int j;
-    int pixelperblock = samples_per_neighbor;
+    int pixelperblock = SAMPLES_PER_NEIGHBOR;
 
     for (j = 0; j < pixelperblock; j++) {
       int r_offset = j / 2;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 3183352..a84c3c9 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -1216,16 +1216,21 @@
 #if CONFIG_WARPED_MOTION
 
 #define IDET_PREC_BITS 48
+#define LEAST_SQUARES_SAMPLES_MAX 32
+#define LEAST_SQUARES_MV_MAX 1024  // max mv in 1/8-pel
 #define IDET_WARPEDMODEL_DIFF_BITS (IDET_PREC_BITS - WARPEDMODEL_PREC_BITS)
+
 static int find_affine_int(const int np, int *pts1, int *pts2,
                            WarpedMotionParams *wm, int mi_row, int mi_col) {
-  int64_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
-  int64_t C[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
-  int64_t Bx[3] = { 0, 0, 0 };
-  int64_t By[3] = { 0, 0, 0 };
+  int32_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
+  int32_t Bx[3] = { 0, 0, 0 };
+  int32_t By[3] = { 0, 0, 0 };
+  int i, j, n = 0, off;
+
+  int64_t C00, C01, C02, C11, C12, C22;
   int64_t Px[3], Py[3];
-  int64_t Det, iDet;
-  int i, off;
+  int64_t Det, iDet, v;
+
   // Offsets to make the values in the arrays smaller
   const int ux = mi_col * MI_SIZE * 8, uy = mi_row * MI_SIZE * 8;
   // Let source points (xi, yi) map to destimation points (xi', yi'),
@@ -1245,79 +1250,83 @@
   //          y' = h3.x + h4.y + dy
   //
   // The loop below computes: A = P'P, Bx = P'q, By = P'r
-  for (i = 0; i < np; ++i) {
-    const int dx = *(pts2++) - ux;
-    const int dy = *(pts2++) - uy;
-    const int sx = *(pts1++) - ux;
-    const int sy = *(pts1++) - uy;
-    A[0][0] += sx * sx;
-    A[0][1] += sx * sy;
-    A[0][2] += sx;
-    A[1][1] += sy * sy;
-    A[1][2] += sy;
-    A[2][2] += 1;
-    Bx[0] += sx * dx;
-    Bx[1] += sy * dx;
-    Bx[2] += dx;
-    By[0] += sx * dy;
-    By[1] += sy * dy;
-    By[2] += dy;
+  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
+  //
+  for (j = 0; j < SAMPLES_PER_NEIGHBOR && n < LEAST_SQUARES_SAMPLES_MAX; ++j) {
+    for (i = j; i < np && n < LEAST_SQUARES_SAMPLES_MAX;
+         i += SAMPLES_PER_NEIGHBOR) {
+      const int dx = pts2[i * 2] - ux;
+      const int dy = pts2[i * 2 + 1] - uy;
+      const int sx = pts1[i * 2] - ux;
+      const int sy = pts1[i * 2 + 1] - uy;
+      if (abs(sx - dx) >= LEAST_SQUARES_MV_MAX ||
+          abs(sy - dy) >= LEAST_SQUARES_MV_MAX)
+        continue;
+      A[0][0] += sx * sx;
+      A[0][1] += sx * sy;
+      A[0][2] += sx;
+      A[1][1] += sy * sy;
+      A[1][2] += sy;
+      A[2][2] += 1;
+      Bx[0] += sx * dx;
+      Bx[1] += sy * dx;
+      Bx[2] += dx;
+      By[0] += sx * dy;
+      By[1] += sy * dy;
+      By[2] += dy;
+      n++;
+    }
   }
   // Compute Cofactors of A
-  C[0][0] = A[1][1] * A[2][2] - A[1][2] * A[1][2];
-  C[0][1] = A[1][2] * A[0][2] - A[0][1] * A[2][2];
-  C[0][2] = A[0][1] * A[1][2] - A[0][2] * A[1][1];
-  C[1][1] = A[0][0] * A[2][2] - A[0][2] * A[0][2];
-  C[1][2] = A[0][1] * A[0][2] - A[0][0] * A[1][2];
-  C[2][2] = A[0][0] * A[1][1] - A[0][1] * A[0][1];
-
-  // Compute Determinant of A
-  Det = C[0][0] * A[0][0] + C[0][1] * A[0][1] + C[0][2] * A[0][2];
-
-  // These are the least squares solutions but need scaling.
-  Px[0] = C[0][0] * Bx[0] + C[0][1] * Bx[1] + C[0][2] * Bx[2];
-  Px[1] = C[0][1] * Bx[0] + C[1][1] * Bx[1] + C[1][2] * Bx[2];
-  Px[2] = C[0][2] * Bx[0] + C[1][2] * Bx[1] + C[2][2] * Bx[2];
-  Py[0] = C[0][0] * By[0] + C[0][1] * By[1] + C[0][2] * By[2];
-  Py[1] = C[0][1] * By[0] + C[1][1] * By[1] + C[1][2] * By[2];
-  Py[2] = C[0][2] * By[0] + C[1][2] * By[1] + C[2][2] * By[2];
+  C00 = (int64_t)A[1][1] * A[2][2] - (int64_t)A[1][2] * A[1][2];
+  C01 = (int64_t)A[1][2] * A[0][2] - (int64_t)A[0][1] * A[2][2];
+  C02 = (int64_t)A[0][1] * A[1][2] - (int64_t)A[0][2] * A[1][1];
+  C11 = (int64_t)A[0][0] * A[2][2] - (int64_t)A[0][2] * A[0][2];
+  C12 = (int64_t)A[0][1] * A[0][2] - (int64_t)A[0][0] * A[1][2];
+  C22 = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
 
   // Scale by 1/16
-  Px[0] = ROUND_POWER_OF_TWO_SIGNED(Px[0], 4);
-  Px[1] = ROUND_POWER_OF_TWO_SIGNED(Px[1], 4);
-  Px[2] = ROUND_POWER_OF_TWO_SIGNED(Px[2], 4);
-  Py[0] = ROUND_POWER_OF_TWO_SIGNED(Py[0], 4);
-  Py[1] = ROUND_POWER_OF_TWO_SIGNED(Py[1], 4);
-  Py[2] = ROUND_POWER_OF_TWO_SIGNED(Py[2], 4);
-  Det = ROUND_POWER_OF_TWO_SIGNED(Det, 4);
+  C00 = ROUND_POWER_OF_TWO_SIGNED(C00, 6);
+  C01 = ROUND_POWER_OF_TWO_SIGNED(C01, 6);
+  C02 = ROUND_POWER_OF_TWO_SIGNED(C02, 6);
+  C11 = ROUND_POWER_OF_TWO_SIGNED(C11, 6);
+  C12 = ROUND_POWER_OF_TWO_SIGNED(C12, 6);
+  C22 = ROUND_POWER_OF_TWO_SIGNED(C22, 6);
+
+  // Compute Determinant of A
+  Det = C00 * A[0][0] + C01 * A[0][1] + C02 * A[0][2];
   if (Det == 0) return 1;
 
+  // These divided by the Det, are the least squares solutions
+  Px[0] = C00 * Bx[0] + C01 * Bx[1] + C02 * Bx[2];
+  Px[1] = C01 * Bx[0] + C11 * Bx[1] + C12 * Bx[2];
+  Px[2] = C02 * Bx[0] + C12 * Bx[1] + C22 * Bx[2];
+  Py[0] = C00 * By[0] + C01 * By[1] + C02 * By[2];
+  Py[1] = C01 * By[0] + C11 * By[1] + C12 * By[2];
+  Py[2] = C02 * By[0] + C12 * By[1] + C22 * By[2];
+
   // Compute inverse of the Determinant
-  // TODO(debargha, yuec): Try to remove this only division
+  // TODO(debargha, yuec): Try to remove this only division if possible
   iDet = ((int64_t)1 << IDET_PREC_BITS) / Det;
 
-  wm->wmmat[2] = ((int64_t)Px[0] * (int64_t)iDet +
-                  ((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
-                 IDET_WARPEDMODEL_DIFF_BITS;
-  wm->wmmat[3] = ((int64_t)Px[1] * (int64_t)iDet +
-                  ((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
-                 IDET_WARPEDMODEL_DIFF_BITS;
-  wm->wmmat[0] = ((int64_t)Px[2] * (int64_t)iDet +
-                  ((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS + 2))) >>
-                 (IDET_WARPEDMODEL_DIFF_BITS + 3);
+  v = Px[0] * iDet;
+  wm->wmmat[2] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+  v = Px[1] * iDet;
+  wm->wmmat[3] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+  v = Px[2] * iDet;
+  wm->wmmat[0] =
+      ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3);
   // Adjust x displacement for the offset
   off = (ux << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[2] - uy * wm->wmmat[3];
   wm->wmmat[0] += ROUND_POWER_OF_TWO_SIGNED(off, 3);
 
-  wm->wmmat[4] = ((int64_t)Py[0] * (int64_t)iDet +
-                  ((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
-                 IDET_WARPEDMODEL_DIFF_BITS;
-  wm->wmmat[5] = ((int64_t)Py[1] * (int64_t)iDet +
-                  ((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS - 1))) >>
-                 IDET_WARPEDMODEL_DIFF_BITS;
-  wm->wmmat[1] = ((int64_t)Py[2] * (int64_t)iDet +
-                  ((int64_t)1 << (IDET_WARPEDMODEL_DIFF_BITS + 2))) >>
-                 (IDET_WARPEDMODEL_DIFF_BITS + 3);
+  v = Py[0] * iDet;
+  wm->wmmat[4] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+  v = Py[1] * iDet;
+  wm->wmmat[5] = ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS);
+  v = Py[2] * iDet;
+  wm->wmmat[1] =
+      ROUND_POWER_OF_TWO_SIGNED_64(v, IDET_WARPEDMODEL_DIFF_BITS + 3);
   // Adjust y displacement for the offset
   off = (uy << WARPEDMODEL_PREC_BITS) - ux * wm->wmmat[4] - uy * wm->wmmat[5];
   wm->wmmat[1] += ROUND_POWER_OF_TWO_SIGNED(off, 3);