Simplify warped motion parameter estimation The purpose of this change is to reduce the cycles needed for warped motion parameter estimation. Method 1: If we remove the 2-bit bit-depth reduction (as in patch set 2), the downshifting of A, Bx, By is also removed. The borg test result (over the baseline) is: avg_psnr ovr_psnr ssim lowres: 0.023 0.020 0.071 cam_lowres: -0.009 -0.017 -0.031 Method 2: In theory, the above change uses 2 more bits for the elements of A, Bx, By. In patch set 3, we modified LS_STEP to be 8 (1 full pixel), so the 2 least significant bits of the elements of A, Bx, By are now always 0. That is, the 2-bit bit-depth reduction is achieved without extra operations. The borg test result (over the baseline) is: lowres: -0.004 -0.007 -0.023 cam_lowres: -0.031 -0.033 -0.045 This is a little better than the patch set 2 result. Method 2 is the final choice. Change-Id: I945aaba412e2ea86b7d67e8a90741fdf395b94cd

commit: 763ccd8c0cd710b74022acc2da7380d79f73dcb7 [log] [tgz]
author: Yunqing Wang <yunqingwang@google.com> Fri Dec 01 17:07:20 2017 -0800
committer: Yunqing Wang <yunqingwang@google.com> Wed Dec 06 20:44:24 2017 +0000
tree: 0df297258902fef3ea95eceaa6cfc5dd6b5e1260
parent: 70539b10d90d6dfd8898605e68c3faffe1975120 [diff]
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index f3bc849..1a9c6dc 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c

@@ -1028,7 +1028,8 @@
 #define LEAST_SQUARES_ORDER 2
 
 #define LS_MV_MAX 256  // max mv in 1/8-pel
-#define LS_STEP 2
+// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
+#define LS_STEP 8
 
 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
 // the precision needed is:
@@ -1049,13 +1050,17 @@
 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
 
-#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
-#define LS_SQUARE(a) \
-  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
-#define LS_PRODUCT1(a, b) \
-  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2)
-#define LS_PRODUCT2(a, b) \
-  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
+// By setting LS_STEP = 8, the 2 least significant bits of every element in
+// A, Bx, By are 0. So, we can drop LS_MAT_DOWN_BITS (2) bits here.
+#define LS_SQUARE(a)                                          \
+  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b)                                           \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b)                                               \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
 
 #define USE_LIMITED_PREC_MULT 0
 
@@ -1140,7 +1145,7 @@
   int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
   int32_t Bx[2] = { 0, 0 };
   int32_t By[2] = { 0, 0 };
-  int i, n = 0;
+  int i;
 
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
@@ -1175,11 +1180,14 @@
   // We need to just compute inv(A).Bx and inv(A).By for the solutions.
   int sx, sy, dx, dy;
   // Contribution from neighbor block
-  for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+  for (i = 0; i < np; i++) {
     dx = pts2[i * 2] - dux;
     dy = pts2[i * 2 + 1] - duy;
     sx = pts1[i * 2] - sux;
     sy = pts1[i * 2 + 1] - suy;
+    // TODO(yunqing): This comparison wouldn't be necessary if the sample
+    // selection were done in find_samples(). Also, the global offset could be
+    // removed while collecting samples.
     if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
       A[0][0] += LS_SQUARE(sx);
       A[0][1] += LS_PRODUCT1(sx, sy);
@@ -1188,32 +1196,17 @@
       Bx[1] += LS_PRODUCT1(sy, dx);
       By[0] += LS_PRODUCT1(sx, dy);
       By[1] += LS_PRODUCT2(sy, dy);
-      n++;
     }
   }
-  int downshift;
-  if (n >= 4)
-    downshift = LS_MAT_DOWN_BITS;
-  else if (n >= 2)
-    downshift = LS_MAT_DOWN_BITS - 1;
-  else
-    downshift = LS_MAT_DOWN_BITS - 2;
 
-  // Reduce precision by downshift bits
-  A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
-                  LS_MAT_MAX);
-  A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
-                  LS_MAT_MAX);
-  A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
-                  LS_MAT_MAX);
-  Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-  Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-  By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
-  By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
-                LS_MAT_MAX);
+  // These asserts are just for debugging and can be removed later.
+  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
 
   int64_t Px[2], Py[2], Det;
   int16_t iDet, shift;
commit	763ccd8c0cd710b74022acc2da7380d79f73dcb7	[log] [tgz]
author	Yunqing Wang <yunqingwang@google.com>	Fri Dec 01 17:07:20 2017 -0800
committer	Yunqing Wang <yunqingwang@google.com>	Wed Dec 06 20:44:24 2017 +0000
tree	0df297258902fef3ea95eceaa6cfc5dd6b5e1260
parent	70539b10d90d6dfd8898605e68c3faffe1975120 [diff]