Adjust the sample selection in Warped Motion

This CL follows the original one, "Sample selection in warped motion", and
aims to reduce the number of samples gathered. We now search only the
nearest above row and the nearest left column instead of a range of 1/2
bsize in each direction. The gain is slightly smaller, but not by much.

Borg test result:
             avg_psnr ovr_psnr ssim
cam_lowres:  -0.225   -0.228  -0.231
lowres:      -0.081   -0.075  -0.114

The changes are wrapped in the WARPED_MOTION_SORT_SAMPLES macro.

Change-Id: Ifb499c80470b2ffe7a40f68fc401589faada7730
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 92a3df3..6c0cc215 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -1099,71 +1099,49 @@
   int ref_frame = mbmi0->ref_frame[0];
   int up_available = xd->up_available;
   int left_available = xd->left_available;
-  int i, mi_step = 1, np = 0, n, j, k;
+  int i, mi_step = 1, np = 0;
   int global_offset_c = mi_col * MI_SIZE;
   int global_offset_r = mi_row * MI_SIZE;
 
   const TileInfo *const tile = &xd->tile;
-  // Search nb range in the unit of mi
-  int bs =
-      (AOMMAX(xd->n8_w, xd->n8_h) > 1) ? (AOMMAX(xd->n8_w, xd->n8_h) >> 1) : 1;
-  int marked[16 * 32];  // max array size for 128x128
   int do_tl = 1;
   int do_tr = 1;
 
-  // scan the above rows
+  // scan the nearest above rows
   if (up_available) {
-    for (n = 0; n < bs; n++) {
-      int mi_row_offset = -1 * (n + 1);
+    int mi_row_offset = -1;
+    MODE_INFO *mi = xd->mi[mi_row_offset * xd->mi_stride];
+    MB_MODE_INFO *mbmi = &mi->mbmi;
+    uint8_t n8_w = mi_size_wide[mbmi->sb_type];
 
-      if (!n) {
-        MODE_INFO *mi = xd->mi[mi_row_offset * xd->mi_stride];
-        MB_MODE_INFO *mbmi = &mi->mbmi;
-        uint8_t n8_w = mi_size_wide[mbmi->sb_type];
+    if (xd->n8_w <= n8_w) {
+      // Handle "current block width <= above block width" case.
+      int col_offset = -mi_col % n8_w;
 
-        // Handle "current block width <= above block width" case.
-        if (xd->n8_w <= n8_w) {
-          int col_offset = -mi_col % n8_w;
+      if (col_offset < 0) do_tl = 0;
+      if (col_offset + n8_w > xd->n8_w) do_tr = 0;
 
-          if (col_offset < 0) do_tl = 0;
-          if (col_offset + n8_w > xd->n8_w) do_tr = 0;
-
-          if (mbmi->ref_frame[0] == ref_frame &&
-              mbmi->ref_frame[1] == NONE_FRAME) {
-            record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                           global_offset_c, 0, -1, col_offset, 1);
-            pts += 2;
-            pts_inref += 2;
-            pts_mv += 2;
-            np++;
-          }
-          break;
-        }
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
+                       global_offset_c, 0, -1, col_offset, 1);
+        pts += 2;
+        pts_inref += 2;
+        pts_mv += 2;
+        np++;
       }
-
+    } else {
       // Handle "current block width > above block width" case.
-      if (!n) memset(marked, 0, bs * xd->n8_w * sizeof(*marked));
-
       for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
         int mi_col_offset = i;
-        MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        MB_MODE_INFO *mbmi = &mi->mbmi;
-        uint8_t n8_w = mi_size_wide[mbmi->sb_type];
-        uint8_t n8_h = mi_size_high[mbmi->sb_type];
-
+        mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+        mbmi = &mi->mbmi;
+        n8_w = mi_size_wide[mbmi->sb_type];
         mi_step = AOMMIN(xd->n8_w, n8_w);
 
-        // Processed already
-        if (marked[n * xd->n8_w + i]) continue;
-
-        for (j = 0; j < AOMMIN(bs, n8_h); j++)
-          for (k = 0; k < AOMMIN(xd->n8_w, n8_w); k++)
-            marked[(n + j) * xd->n8_w + i + k] = 1;
-
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
           record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                         global_offset_c, -n, -1, i, 1);
+                         global_offset_c, 0, -1, i, 1);
           pts += 2;
           pts_inref += 2;
           pts_mv += 2;
@@ -1174,58 +1152,41 @@
   }
   assert(2 * np <= SAMPLES_ARRAY_SIZE);
 
-  // scan the left columns
+  // scan the nearest left columns
   if (left_available) {
-    for (n = 0; n < bs; n++) {
-      int mi_col_offset = -1 * (n + 1);
+    int mi_col_offset = -1;
 
-      if (!n) {
-        MODE_INFO *mi = xd->mi[mi_col_offset];
-        MB_MODE_INFO *mbmi = &mi->mbmi;
-        uint8_t n8_h = mi_size_high[mbmi->sb_type];
+    MODE_INFO *mi = xd->mi[mi_col_offset];
+    MB_MODE_INFO *mbmi = &mi->mbmi;
+    uint8_t n8_h = mi_size_high[mbmi->sb_type];
 
-        // Handle "current block height <= above block height" case.
-        if (xd->n8_h <= n8_h) {
-          int row_offset = -mi_row % n8_h;
+    if (xd->n8_h <= n8_h) {
+      // Handle "current block height <= above block height" case.
+      int row_offset = -mi_row % n8_h;
 
-          if (row_offset < 0) do_tl = 0;
+      if (row_offset < 0) do_tl = 0;
 
-          if (mbmi->ref_frame[0] == ref_frame &&
-              mbmi->ref_frame[1] == NONE_FRAME) {
-            record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                           global_offset_c, row_offset, 1, 0, -1);
-            pts += 2;
-            pts_inref += 2;
-            pts_mv += 2;
-            np++;
-          }
-          break;
-        }
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
+                       global_offset_c, row_offset, 1, 0, -1);
+        pts += 2;
+        pts_inref += 2;
+        pts_mv += 2;
+        np++;
       }
-
+    } else {
       // Handle "current block height > above block height" case.
-      if (!n) memset(marked, 0, bs * xd->n8_h * sizeof(*marked));
-
       for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
         int mi_row_offset = i;
-        MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-        MB_MODE_INFO *mbmi = &mi->mbmi;
-        uint8_t n8_w = mi_size_wide[mbmi->sb_type];
-        uint8_t n8_h = mi_size_high[mbmi->sb_type];
-
+        mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+        mbmi = &mi->mbmi;
+        n8_h = mi_size_high[mbmi->sb_type];
         mi_step = AOMMIN(xd->n8_h, n8_h);
 
-        // Processed already
-        if (marked[n * xd->n8_h + i]) continue;
-
-        for (j = 0; j < AOMMIN(bs, n8_w); j++)
-          for (k = 0; k < AOMMIN(xd->n8_h, n8_h); k++)
-            marked[(n + j) * xd->n8_h + i + k] = 1;
-
         if (mbmi->ref_frame[0] == ref_frame &&
             mbmi->ref_frame[1] == NONE_FRAME) {
           record_samples(mbmi, pts, pts_inref, pts_mv, global_offset_r,
-                         global_offset_c, i, 1, -n, -1);
+                         global_offset_c, i, 1, 0, -1);
           pts += 2;
           pts_inref += 2;
           pts_mv += 2;
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 845eb9a..e05f6a8 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -30,10 +30,9 @@
 #define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
 
 #if WARPED_MOTION_SORT_SAMPLES
-// #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
-// Search half bsize on the top and half bsize on the left, 1 upper-left block,
+// Search 1 row on the top and 1 column on the left, 1 upper-left block,
 // 1 upper-right block.
-#define SAMPLES_ARRAY_SIZE ((MAX_MIB_SIZE * MAX_MIB_SIZE + 2) * 2)
+#define SAMPLES_ARRAY_SIZE ((MAX_MIB_SIZE * 2 + 2) * 2)
 #else
 #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
 #endif  // WARPED_MOTION_SORT_SAMPLES