Revert "Limit to 192 filters for warp, clamp index since in some cases index 192"
This reverts commit 266db85d4ac58207188784188696b294b30c3892.
Reason for revert: The av1_get_warped_filter() index clamp applied on every filter lookup slows down the software warp filter. The index-192 fix will be implemented differently in a separate patch.
Change-Id: I386a9661c87d69e22761e5c01507f2f1f968433f
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 5cc11fb..4825c81 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -498,7 +498,7 @@
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
/* clang-format off */
-const int16_t warped_filter_taps[WARPEDPIXEL_PREC_SHIFTS * 3][8] = {
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
// [-1, 0)
{ 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
@@ -656,11 +656,10 @@
{0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
#endif // WARPEDPIXEL_PREC_BITS == 6
-};
-const int16_t *av1_get_warped_filter(int offs) {
- return warped_filter_taps[(offs >= 192) ? 191 : offs];
-}
+ // dummy entry: keeps index WARPEDPIXEL_PREC_SHIFTS * 3 within bounds
+ { 0, 0, 0, 0, 1, 127, 0, 0 },
+};
/* clang-format on */
@@ -1025,7 +1024,7 @@
int ix = ix4 + l - 3;
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = av1_get_warped_filter(offs);
+ const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
@@ -1050,7 +1049,7 @@
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = av1_get_warped_filter(offs);
+ const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
@@ -1286,7 +1285,7 @@
// At this point, sx = sx4 + alpha * l + beta * k
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = av1_get_warped_filter(offs);
+ const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
@@ -1308,7 +1307,7 @@
// At this point, sy = sy4 + gamma * l + delta * k
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
- const int16_t *coeffs = av1_get_warped_filter(offs);
+ const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 550c08d..72b511f 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -33,7 +33,7 @@
#define DEFAULT_WMTYPE AFFINE
#endif // CONFIG_WARPED_MOTION
-const int16_t *av1_get_warped_filter(int offs);
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
const int n, const int stride_points,
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 31de232..0da714b 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -14,6 +14,8 @@
#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
/* SSE2 version of the rotzoom/affine warp filter */
void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
@@ -96,14 +98,10 @@
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
// Filter even-index pixels
- __m128i tmp_0 = *(__m128i const *)av1_get_warped_filter(
- (sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_2 = *(__m128i const *)av1_get_warped_filter(
- (sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_4 = *(__m128i const *)av1_get_warped_filter(
- (sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_6 = *(__m128i const *)av1_get_warped_filter(
- (sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS);
+ __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS];
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
@@ -142,14 +140,10 @@
HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
- __m128i tmp_1 = *(__m128i const *)av1_get_warped_filter(
- (sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_3 = *(__m128i const *)av1_get_warped_filter(
- (sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_5 = *(__m128i const *)av1_get_warped_filter(
- (sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_7 = *(__m128i const *)av1_get_warped_filter(
- (sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS);
+ __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
__m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
@@ -197,14 +191,10 @@
__m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
// Filter even-index pixels
- __m128i tmp_0 = *(__m128i const *)av1_get_warped_filter(
- (sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_2 = *(__m128i const *)av1_get_warped_filter(
- (sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_4 = *(__m128i const *)av1_get_warped_filter(
- (sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_6 = *(__m128i const *)av1_get_warped_filter(
- (sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS);
+ __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
__m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
@@ -230,14 +220,10 @@
__m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
__m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
- __m128i tmp_1 = *(__m128i const *)av1_get_warped_filter(
- (sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_3 = *(__m128i const *)av1_get_warped_filter(
- (sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_5 = *(__m128i const *)av1_get_warped_filter(
- (sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS);
- __m128i tmp_7 = *(__m128i const *)av1_get_warped_filter(
- (sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS);
+ __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS];
+ __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
__m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);