Use 1 sample per neighbor for local warping model estimation
Only 1 sample needs to be collected. Max of 8 neighbors are
used.
In LS estimation, the projection samples (sx, sy)->(dx, dy) are
intentionally smoothed by assuming 3 shifted versions
(sx, sy+n)->(dx, dy+n), (sx+n, sy)->(dx+n, dy), (sx+n,
sy+n)->(dx+n, dy+n) also contribute to the estimation.
For example, instead of using A[0] = sx^2, we use the sum of
squares of source x of four points, A[0] += 4sx^2+4*n*sx+n^2.
But computational cost wise, it does not add much overhead. Coding
gain is mostly same as the old version. If no smoothing is added,
will lose 0.3% on lowres.
Change-Id: I04be32cffa525f7dc8ee583c0bf211d7bdc6e609
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 3cd0a02..12e5eab 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1113,7 +1113,7 @@
if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
#endif
#if CONFIG_WARPED_MOTION
- if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 3)
+ if (!has_second_ref(mbmi) && mbmi->num_proj_ref[0] >= 1)
return WARPED_CAUSAL;
else
#endif // CONFIG_WARPED_MOTION
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 9e8f181..019a20f 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -1123,25 +1123,20 @@
int bh = block_size_high[mbmi->sb_type];
int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = i * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1;
- int j;
- int pixelperblock = SAMPLES_PER_NEIGHBOR;
+ int x = cc_offset + global_offset_c;
+ int y = cr_offset + global_offset_r;
- for (j = 0; j < pixelperblock; j++) {
- int x = cc_offset + j % 2 + global_offset_c;
- int y = cr_offset + j / 2 + global_offset_r;
-
- pts[0] = (x * 8);
- pts[1] = (y * 8);
- calc_projection_samples(mbmi,
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ calc_projection_samples(mbmi,
#if CONFIG_GLOBAL_MOTION
- xd,
+ xd,
#endif
- x, y, pts_inref);
-
- pts += 2;
- pts_inref += 2;
- }
- np += pixelperblock;
+ x, y, pts_inref);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
}
}
}
@@ -1163,25 +1158,20 @@
int bh = block_size_high[mbmi->sb_type];
int cr_offset = i * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
- int j;
- int pixelperblock = SAMPLES_PER_NEIGHBOR;
+ int x = cc_offset + global_offset_c;
+ int y = cr_offset + global_offset_r;
- for (j = 0; j < pixelperblock; j++) {
- int x = cc_offset + j % 2 + global_offset_c;
- int y = cr_offset + j / 2 + global_offset_r;
-
- pts[0] = (x * 8);
- pts[1] = (y * 8);
- calc_projection_samples(mbmi,
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ calc_projection_samples(mbmi,
#if CONFIG_GLOBAL_MOTION
- xd,
+ xd,
#endif
- x, y, pts_inref);
-
- pts += 2;
- pts_inref += 2;
- }
- np += pixelperblock;
+ x, y, pts_inref);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
}
}
}
@@ -1199,33 +1189,23 @@
int bh = block_size_high[mbmi->sb_type];
int cr_offset = -AOMMAX(bh, MI_SIZE) / 2 - 1;
int cc_offset = -AOMMAX(bw, MI_SIZE) / 2 - 1;
- int j;
- int pixelperblock = SAMPLES_PER_NEIGHBOR;
+ int x = cc_offset + global_offset_c;
+ int y = cr_offset + global_offset_r;
- for (j = 0; j < pixelperblock; j++) {
- int x = cc_offset + j % 2 + global_offset_c;
- int y = cr_offset + j / 2 + global_offset_r;
-
- pts[0] = (x * 8);
- pts[1] = (y * 8);
- calc_projection_samples(mbmi,
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ calc_projection_samples(mbmi,
#if CONFIG_GLOBAL_MOTION
- xd,
+ xd,
#endif
- x, y, pts_inref);
-
- pts += 2;
- pts_inref += 2;
- }
- np += pixelperblock;
+ x, y, pts_inref);
+ pts += 2;
+ pts_inref += 2;
+ np++;
}
}
assert(2 * np <= SAMPLES_ARRAY_SIZE);
- if (np == 0) {
- return 0;
- }
- assert(2 * np <= SAMPLES_ARRAY_SIZE);
return np;
}
#endif // CONFIG_WARPED_MOTION
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index adcfe3d..19bef93 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -1279,10 +1279,17 @@
}
#if CONFIG_WARPED_MOTION
-
#define LEAST_SQUARES_ORDER 2
-#define LEAST_SQUARES_SAMPLES_MAX 32
-#define LEAST_SQUARES_MV_MAX 1024 // max mv in 1/8-pel
+
+#define LS_MV_MAX 1024 // max mv in 1/8-pel
+#define LS_STEP 2
+
+#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
+#define LS_SQUARE(a) ((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2)
+#define LS_PRODUCT1(a, b) \
+ ((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP)
+#define LS_PRODUCT2(a, b) \
+ ((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2)
#if LEAST_SQUARES_ORDER == 2
static int find_affine_int(const int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
@@ -1291,7 +1298,7 @@
int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
int32_t Bx[2] = { 0, 0 };
int32_t By[2] = { 0, 0 };
- int i, j, n = 0;
+ int i, n = 0;
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
@@ -1322,26 +1329,22 @@
//
// The loop below computes: A = P'P, Bx = P'q, By = P'r
// We need to just compute inv(A).Bx and inv(A).By for the solutions.
- for (j = 0; j < SAMPLES_PER_NEIGHBOR && n < LEAST_SQUARES_SAMPLES_MAX; ++j) {
- int sx, sy, dx, dy;
- // Contribution from neighbor block
- for (i = j; i < np && n < LEAST_SQUARES_SAMPLES_MAX;
- i += SAMPLES_PER_NEIGHBOR) {
- dx = pts2[i * 2] - dux;
- dy = pts2[i * 2 + 1] - duy;
- sx = pts1[i * 2] - sux;
- sy = pts1[i * 2 + 1] - suy;
- if (abs(sx - dx) < LEAST_SQUARES_MV_MAX &&
- abs(sy - dy) < LEAST_SQUARES_MV_MAX) {
- A[0][0] += sx * sx;
- A[0][1] += sx * sy;
- A[1][1] += sy * sy;
- Bx[0] += sx * dx;
- Bx[1] += sy * dx;
- By[0] += sx * dy;
- By[1] += sy * dy;
- n++;
- }
+ int sx, sy, dx, dy;
+ // Contribution from neighbor block
+ for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+ dx = pts2[i * 2] - dux;
+ dy = pts2[i * 2 + 1] - duy;
+ sx = pts1[i * 2] - sux;
+ sy = pts1[i * 2 + 1] - suy;
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[1][1] += LS_SQUARE(sy);
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ n++;
}
}
int64_t Px[2], Py[2];
@@ -1390,7 +1393,7 @@
int32_t A[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
int32_t Bx[3] = { 0, 0, 0 };
int32_t By[3] = { 0, 0, 0 };
- int i, j, n = 0, off;
+ int i, n = 0, off;
int64_t C00, C01, C02, C11, C12, C22;
int64_t Px[3], Py[3];
@@ -1421,55 +1424,48 @@
// The loop below computes: A = P'P, Bx = P'q, By = P'r
// We need to just compute inv(A).Bx and inv(A).By for the solutions.
//
- for (j = 0; j < SAMPLES_PER_NEIGHBOR && n < LEAST_SQUARES_SAMPLES_MAX; ++j) {
- // Contribution from sample in current block
- const int y_offset = j >> 1;
- const int x_offset = j & 1;
- int sx, sy, dx, dy;
- sx = (cx_offset + x_offset) * 8;
- sy = (cy_offset + y_offset) * 8;
- dx = sx + mvx;
- dy = sy + mvy;
- if (abs(sx - dx) < LEAST_SQUARES_MV_MAX &&
- abs(sy - dy) < LEAST_SQUARES_MV_MAX) {
- A[0][0] += sx * sx;
- A[0][1] += sx * sy;
- A[0][2] += sx;
- A[1][1] += sy * sy;
- A[1][2] += sy;
- A[2][2] += 1;
- Bx[0] += sx * dx;
- Bx[1] += sy * dx;
- Bx[2] += dx;
- By[0] += sx * dy;
- By[1] += sy * dy;
- By[2] += dy;
+ int sx, sy, dx, dy;
+ // Contribution from sample in current block
+ sx = cx_offset * 8;
+ sy = cy_offset * 8;
+ dx = sx + mvx;
+ dy = sy + mvy;
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[0][2] += LS_SUM(sx);
+ A[1][1] += LS_SQUARE(sy);
+ A[1][2] += LS_SUM(sy);
+ A[2][2] += 4;
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ Bx[2] += LS_SUM(dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ By[2] += LS_SUM(dy);
+ n++;
+ }
+ // Contribution from neighbor block
+ for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
+ dx = pts2[i * 2] - ux;
+ dy = pts2[i * 2 + 1] - uy;
+ sx = pts1[i * 2] - ux;
+ sy = pts1[i * 2 + 1] - uy;
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[0][2] += LS_SUM(sx);
+ A[1][1] += LS_SQUARE(sy);
+ A[1][2] += LS_SUM(sy);
+ A[2][2] += 4;
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ Bx[2] += LS_SUM(dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ By[2] += LS_SUM(dy);
n++;
}
- // Contribution from neighbor block
- for (i = j; i < np && n < LEAST_SQUARES_SAMPLES_MAX;
- i += SAMPLES_PER_NEIGHBOR) {
- dx = pts2[i * 2] - ux;
- dy = pts2[i * 2 + 1] - uy;
- sx = pts1[i * 2] - ux;
- sy = pts1[i * 2 + 1] - uy;
- if (abs(sx - dx) < LEAST_SQUARES_MV_MAX &&
- abs(sy - dy) < LEAST_SQUARES_MV_MAX) {
- A[0][0] += sx * sx;
- A[0][1] += sx * sy;
- A[0][2] += sx;
- A[1][1] += sy * sy;
- A[1][2] += sy;
- A[2][2] += 1;
- Bx[0] += sx * dx;
- Bx[1] += sy * dx;
- Bx[2] += dx;
- By[0] += sx * dy;
- By[1] += sy * dy;
- By[2] += dy;
- n++;
- }
- }
}
// Compute Cofactors of A
C00 = (int64_t)A[1][1] * A[2][2] - (int64_t)A[1][2] * A[1][2];
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 0eb3ad5..98139dd 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -25,8 +25,8 @@
#define MAX_PARAMDIM 9
#if CONFIG_WARPED_MOTION
-#define SAMPLES_PER_NEIGHBOR 4
-#define SAMPLES_ARRAY_SIZE ((2 * MAX_MIB_SIZE + 2) * SAMPLES_PER_NEIGHBOR * 2)
+#define SAMPLES_ARRAY_SIZE ((2 * MAX_MIB_SIZE + 2) * 2)
+#define LEAST_SQUARES_SAMPLES_MAX 8
#define DEFAULT_WMTYPE AFFINE
#endif // CONFIG_WARPED_MOTION