Revert "Limit to 192 filters for warp, clamp index since in some cases index 192"

This reverts commit 266db85d4ac58207188784188696b294b30c3892.

Reason for revert: The reverted change slows down the software. The index clamping will be implemented differently in a separate patch.

Change-Id: I386a9661c87d69e22761e5c01507f2f1f968433f
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 5cc11fb..4825c81 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -498,7 +498,7 @@
 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
 // We need an extra 2 taps to fit this in, for a total of 8 taps.
 /* clang-format off */
-const int16_t warped_filter_taps[WARPEDPIXEL_PREC_SHIFTS * 3][8] = {
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
 #if WARPEDPIXEL_PREC_BITS == 6
   // [-1, 0)
   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
@@ -656,11 +656,10 @@
   {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
 
 #endif  // WARPEDPIXEL_PREC_BITS == 6
-};
 
-const int16_t *av1_get_warped_filter(int offs) {
-  return warped_filter_taps[(offs >= 192) ? 191 : offs];
-}
+  // dummy
+  { 0, 0, 0, 0,   1, 127, 0, 0 },
+};
 
 /* clang-format on */
 
@@ -1025,7 +1024,7 @@
             int ix = ix4 + l - 3;
             const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                              WARPEDPIXEL_PREC_SHIFTS;
-            const int16_t *coeffs = av1_get_warped_filter(offs);
+            const int16_t *coeffs = warped_filter[offs];
             int32_t sum = 0;
             // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
             for (m = 0; m < 8; ++m) {
@@ -1050,7 +1049,7 @@
               &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
-          const int16_t *coeffs = av1_get_warped_filter(offs);
+          const int16_t *coeffs = warped_filter[offs];
           int32_t sum = 0;
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
@@ -1286,7 +1285,7 @@
             // At this point, sx = sx4 + alpha * l + beta * k
             const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                              WARPEDPIXEL_PREC_SHIFTS;
-            const int16_t *coeffs = av1_get_warped_filter(offs);
+            const int16_t *coeffs = warped_filter[offs];
             int32_t sum = 0;
             // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
             for (m = 0; m < 8; ++m) {
@@ -1308,7 +1307,7 @@
           // At this point, sy = sy4 + gamma * l + delta * k
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
-          const int16_t *coeffs = av1_get_warped_filter(offs);
+          const int16_t *coeffs = warped_filter[offs];
           int32_t sum = 0;
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 550c08d..72b511f 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -33,7 +33,7 @@
 #define DEFAULT_WMTYPE AFFINE
 #endif  // CONFIG_WARPED_MOTION
 
-const int16_t *av1_get_warped_filter(int offs);
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
 
 typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
                                   const int n, const int stride_points,
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 31de232..0da714b 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -14,6 +14,8 @@
 #include "./av1_rtcd.h"
 #include "av1/common/warped_motion.h"
 
+static const __m128i *const filter = (const __m128i *const)warped_filter;
+
 /* SSE2 version of the rotzoom/affine warp filter */
 void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
                           int stride, uint8_t *pred, int p_col, int p_row,
@@ -96,14 +98,10 @@
               _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 
           // Filter even-index pixels
-          __m128i tmp_0 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS);
-          __m128i tmp_2 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS);
-          __m128i tmp_4 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS);
-          __m128i tmp_6 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS);
+          __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS];
+          __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS];
+          __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS];
+          __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS];
 
           // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
           __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
@@ -142,14 +140,10 @@
                                     HORSHEAR_REDUCE_PREC_BITS);
 
           // Filter odd-index pixels
-          __m128i tmp_1 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS);
-          __m128i tmp_3 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS);
-          __m128i tmp_5 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS);
-          __m128i tmp_7 = *(__m128i const *)av1_get_warped_filter(
-              (sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS);
+          __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
+          __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS];
+          __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS];
+          __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS];
 
           __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
           __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
@@ -197,14 +191,10 @@
         __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
 
         // Filter even-index pixels
-        __m128i tmp_0 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS);
-        __m128i tmp_2 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS);
-        __m128i tmp_4 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS);
-        __m128i tmp_6 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS);
+        __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS];
+        __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS];
+        __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS];
+        __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS];
 
         __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
         __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
@@ -230,14 +220,10 @@
         __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
         __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
 
-        __m128i tmp_1 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS);
-        __m128i tmp_3 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS);
-        __m128i tmp_5 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS);
-        __m128i tmp_7 = *(__m128i const *)av1_get_warped_filter(
-            (sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS);
+        __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS];
+        __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS];
+        __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS];
+        __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS];
 
         __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
         __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);