Use 1 sample per neighbor for local warping model estimation

Only 1 sample needs to be collected. Max of 8 neighbors are
used.
In LS estimation, the projection samples (sx, sy)->(dx, dy) are
intentionally smoothed by assuming 3 shifted versions
(sx, sy+n)->(dx, dy+n), (sx+n, sy)->(dx+n, dy), (sx+n,
sy+n)->(dx+n, dy+n) also contribute to the estimation.
For example, instead of using A[0] = sx^2, we use the sum of
squares of source x of four points, A[0] += 4sx^2+4*n*sx+n^2.
But computational cost wise, it does not add much overhead. Coding
gain is mostly same as the old version. If no smoothing is added,
will lose 0.3% on lowres.

Change-Id: I04be32cffa525f7dc8ee583c0bf211d7bdc6e609
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 3cd0a02..12e5eab 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1113,7 +1113,7 @@
     if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
 #endif
 #if CONFIG_WARPED_MOTION
-    if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 3)
+    if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 1)
       return WARPED_CAUSAL;
     else
 #endif  // CONFIG_WARPED_MOTION
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 9e8f181..019a20f 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -1123,25 +1123,20 @@
         int bh = block_size_high[mbmi->sb_type];
         int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
         int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1;
-        int j;
-        int pixelperblock = SAMPLES_PER_NEIGHBOR;
+        int x = cc_offset + global_offset_c;
+        int y = cr_offset + global_offset_r;
 
-        for (j = 0; j < pixelperblock; j++) {
-          int x = cc_offset + j % 2 + global_offset_c;
-          int y = cr_offset + j / 2 + global_offset_r;
-
-          pts[0] = (x * 8);
-          pts[1] = (y * 8);
-          calc_projection_samples(mbmi,
+        pts[0] = (x * 8);
+        pts[1] = (y * 8);
+        calc_projection_samples(mbmi,
 #if CONFIG_GLOBAL_MOTION
-                                  xd,
+                                xd,
 #endif
-                                  x, y, pts_inref);
-
-          pts += 2;
-          pts_inref += 2;
-        }
-        np += pixelperblock;
+                                x, y, pts_inref);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     }
   }
@@ -1163,25 +1158,20 @@
         int bh = block_size_high[mbmi->sb_type];
         int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1;
         int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
-        int j;
-        int pixelperblock = SAMPLES_PER_NEIGHBOR;
+        int x = cc_offset + global_offset_c;
+        int y = cr_offset + global_offset_r;
 
-        for (j = 0; j < pixelperblock; j++) {
-          int x = cc_offset + j % 2 + global_offset_c;
-          int y = cr_offset + j / 2 + global_offset_r;
-
-          pts[0] = (x * 8);
-          pts[1] = (y * 8);
-          calc_projection_samples(mbmi,
+        pts[0] = (x * 8);
+        pts[1] = (y * 8);
+        calc_projection_samples(mbmi,
 #if CONFIG_GLOBAL_MOTION
-                                  xd,
+                                xd,
 #endif
-                                  x, y, pts_inref);
-
-          pts += 2;
-          pts_inref += 2;
-        }
-        np += pixelperblock;
+                                x, y, pts_inref);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
       }
     }
   }
@@ -1199,33 +1189,23 @@
       int bh = block_size_high[mbmi->sb_type];
       int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
       int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
-      int j;
-      int pixelperblock = SAMPLES_PER_NEIGHBOR;
+      int x = cc_offset + global_offset_c;
+      int y = cr_offset + global_offset_r;
 
-      for (j = 0; j < pixelperblock; j++) {
-        int x = cc_offset + j % 2 + global_offset_c;
-        int y = cr_offset + j / 2 + global_offset_r;
-
-        pts[0] = (x * 8);
-        pts[1] = (y * 8);
-        calc_projection_samples(mbmi,
+      pts[0] = (x * 8);
+      pts[1] = (y * 8);
+      calc_projection_samples(mbmi,
 #if CONFIG_GLOBAL_MOTION
-                                xd,
+                              xd,
 #endif
-                                x, y, pts_inref);
-
-        pts += 2;
-        pts_inref += 2;
-      }
-      np += pixelperblock;
+                              x, y, pts_inref);
+      pts += 2;
+      pts_inref += 2;
+      np++;
     }
   }
   assert(2 * np <= SAMPLES_ARRAY_SIZE);
 
-  if (np == 0) {
-    return 0;
-  }
-  assert(2 * np <= SAMPLES_ARRAY_SIZE);
   return np;
 }
 #endif  // CONFIG_WARPED_MOTION
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index adcfe3d..19bef93 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -1279,10 +1279,17 @@
 }
 
 #if CONFIG_WARPED_MOTION
-
 #define LEAST_SQUARES_ORDER 2
-#define LEAST_SQUARES_SAMPLES_MAX 32
-#define LEAST_SQUARES_MV_MAX 1024  // max mv in 1/8-pel
+
+#define LS_MV_MAX 1024  // max mv in 1/8-pel
+#define LS_STEP 2
+
+#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
+#define LS_SQUARE(a) ((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2)
+#define LS_PRODUCT1(a, b) \
+  ((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP)
+#define LS_PRODUCT2(a, b) \
+  ((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2)
 
 #if LEAST_SQUARES_ORDER == 2
 static int find_affine_int(const int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
@@ -1291,7 +1298,7 @@
   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
   int32_t Bx[2] = { 0, 0 };
   int32_t By[2] = { 0, 0 };
-  int i, j, n = 0;
+  int i, n = 0;
 
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
@@ -1322,26 +1329,22 @@
   //
   // The loop below computes: A = P'P, Bx = P'q, By = P'r
   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
-  for (j = 0; j < SAMPLES_PER_NEIGHBOR && n < LEAST_SQUARES_SAMPLES_MAX; ++j) {
-    int sx, sy, dx, dy;
-    // Contribution from neighbor block
-    for (i = j; i < np && n < LEAST_SQUARES_SAMPLES_MAX;
-         i += SAMPLES_PER_NEIGHBOR) {
-      dx = pts2[i * 2] - dux;
-      dy = pts2[i * 2 + 1] - duy;
-      sx = pts1[i * 2] - sux;
-      sy = pts1[i * 2 + 1] - suy;
-      if (abs(sx - dx) < LEAST_SQUARES_MV_MAX &&
-          abs(sy - dy) < LEAST_SQUARES_MV_MAX) {
-        A[0][0] += sx * sx;
-        A[0][1] += sx * sy;
-        A[1][1] += sy * sy;
-        Bx[0] += sx * dx;
-        Bx[1] += sy * dx;
-        By[0] += sx * dy;
-        By[1] += sy * dy;
-        n++;
-      }
+  int sx, sy, dx, dy;
+  // Contribution from neighbor block
+  for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+    dx = pts2[i * 2] - dux;
+    dy = pts2[i * 2 + 1] - duy;
+    sx = pts1[i * 2] - sux;
+    sy = pts1[i * 2 + 1] - suy;
+    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+      A[0][0] += LS_SQUARE(sx);
+      A[0][1] += LS_PRODUCT1(sx, sy);
+      A[1][1] += LS_SQUARE(sy);
+      Bx[0] += LS_PRODUCT2(sx, dx);
+      Bx[1] += LS_PRODUCT1(sy, dx);
+      By[0] += LS_PRODUCT1(sx, dy);
+      By[1] += LS_PRODUCT2(sy, dy);
+      n++;
     }
   }
   int64_t Px[2], Py[2];
@@ -1390,7 +1393,7 @@
   int32_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
   int32_t Bx[3] = { 0, 0, 0 };
   int32_t By[3] = { 0, 0, 0 };
-  int i, j, n = 0, off;
+  int i, n = 0, off;
 
   int64_t C00, C01, C02, C11, C12, C22;
   int64_t Px[3], Py[3];
@@ -1421,55 +1424,48 @@
   // The loop below computes: A = P'P, Bx = P'q, By = P'r
   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
   //
-  for (j = 0; j < SAMPLES_PER_NEIGHBOR && n < LEAST_SQUARES_SAMPLES_MAX; ++j) {
-    // Contribution from sample in current block
-    const int y_offset = j >> 1;
-    const int x_offset = j & 1;
-    int sx, sy, dx, dy;
-    sx = (cx_offset + x_offset) * 8;
-    sy = (cy_offset + y_offset) * 8;
-    dx = sx + mvx;
-    dy = sy + mvy;
-    if (abs(sx - dx) < LEAST_SQUARES_MV_MAX &&
-        abs(sy - dy) < LEAST_SQUARES_MV_MAX) {
-      A[0][0] += sx * sx;
-      A[0][1] += sx * sy;
-      A[0][2] += sx;
-      A[1][1] += sy * sy;
-      A[1][2] += sy;
-      A[2][2] += 1;
-      Bx[0] += sx * dx;
-      Bx[1] += sy * dx;
-      Bx[2] += dx;
-      By[0] += sx * dy;
-      By[1] += sy * dy;
-      By[2] += dy;
+  int sx, sy, dx, dy;
+  // Contribution from sample in current block
+  sx = cx_offset * 8;
+  sy = cy_offset * 8;
+  dx = sx + mvx;
+  dy = sy + mvy;
+  if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+    A[0][0] += LS_SQUARE(sx);
+    A[0][1] += LS_PRODUCT1(sx, sy);
+    A[0][2] += LS_SUM(sx);
+    A[1][1] += LS_SQUARE(sy);
+    A[1][2] += LS_SUM(sy);
+    A[2][2] += 4;
+    Bx[0] += LS_PRODUCT2(sx, dx);
+    Bx[1] += LS_PRODUCT1(sy, dx);
+    Bx[2] += LS_SUM(dx);
+    By[0] += LS_PRODUCT1(sx, dy);
+    By[1] += LS_PRODUCT2(sy, dy);
+    By[2] += LS_SUM(dy);
+    n++;
+  }
+  // Contribution from neighbor block
+  for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+    dx = pts2[i * 2] - ux;
+    dy = pts2[i * 2 + 1] - uy;
+    sx = pts1[i * 2] - ux;
+    sy = pts1[i * 2 + 1] - uy;
+    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+      A[0][0] += LS_SQUARE(sx);
+      A[0][1] += LS_PRODUCT1(sx, sy);
+      A[0][2] += LS_SUM(sx);
+      A[1][1] += LS_SQUARE(sy);
+      A[1][2] += LS_SUM(sy);
+      A[2][2] += 4;
+      Bx[0] += LS_PRODUCT2(sx, dx);
+      Bx[1] += LS_PRODUCT1(sy, dx);
+      Bx[2] += LS_SUM(dx);
+      By[0] += LS_PRODUCT1(sx, dy);
+      By[1] += LS_PRODUCT2(sy, dy);
+      By[2] += LS_SUM(dy);
       n++;
     }
-    // Contribution from neighbor block
-    for (i = j; i < np && n < LEAST_SQUARES_SAMPLES_MAX;
-         i += SAMPLES_PER_NEIGHBOR) {
-      dx = pts2[i * 2] - ux;
-      dy = pts2[i * 2 + 1] - uy;
-      sx = pts1[i * 2] - ux;
-      sy = pts1[i * 2 + 1] - uy;
-      if (abs(sx - dx) < LEAST_SQUARES_MV_MAX &&
-          abs(sy - dy) < LEAST_SQUARES_MV_MAX) {
-        A[0][0] += sx * sx;
-        A[0][1] += sx * sy;
-        A[0][2] += sx;
-        A[1][1] += sy * sy;
-        A[1][2] += sy;
-        A[2][2] += 1;
-        Bx[0] += sx * dx;
-        Bx[1] += sy * dx;
-        Bx[2] += dx;
-        By[0] += sx * dy;
-        By[1] += sy * dy;
-        By[2] += dy;
-        n++;
-      }
-    }
   }
   // Compute Cofactors of A
   C00 = (int64_t)A[1][1] * A[2][2] - (int64_t)A[1][2] * A[1][2];
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 0eb3ad5..98139dd 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -25,8 +25,8 @@
 
 #define MAX_PARAMDIM 9
 #if CONFIG_WARPED_MOTION
-#define SAMPLES_PER_NEIGHBOR 4
-#define SAMPLES_ARRAY_SIZE ((2 * MAX_MIB_SIZE + 2) * SAMPLES_PER_NEIGHBOR * 2)
+#define SAMPLES_ARRAY_SIZE ((2 * MAX_MIB_SIZE + 2) * 2)
+#define LEAST_SQUARES_SAMPLES_MAX 8
 #define DEFAULT_WMTYPE AFFINE
 #endif  // CONFIG_WARPED_MOTION