Add SSE2 vectorized warp filter for lowbd
End-to-end speed improvements (measured on tempete_cif.y4m, 20 frames
for the encoder and all 260 frames for the decoder):
* GLOBAL_MOTION encoder: ~10% faster
* GLOBAL_MOTION decoder: 100-200% faster depending on bitrate
* WARPED_MOTION encoder: ~2.5% faster
* WARPED_MOTION decoder: ~20-40% faster depending on bitrate
The improvement in the GLOBAL_MOTION decoder is particularly
large because its runtime is dominated by calls to warp_plane().
This introduces minor changes to the output of the warp filter,
but these should be rare.
Change-Id: I5813ab9e90311e27587045153c32d400b6b9eb92
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 04dd817..abc812a 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -157,4 +157,8 @@
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
endif
+ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c
+endif
+
$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e4d1e6e..f081c19 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -765,4 +765,12 @@
specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
}
+# WARPED_MOTION / GLOBAL_MOTION functions
+
+if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
+ (aom_config("CONFIG_GLOBAL_MOTION") eq "yes")) {
+ add_proto qw/void warp_affine/, "int32_t *mat, uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int ref_frm, int32_t alpha, int32_t beta, int32_t gamma, int32_t delta";
+ specialize qw/warp_affine sse2/;
+}
+
1;
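
The add_proto/specialize pair above routes callers through the usual rtcd
dispatch, so warp_affine() resolves to warp_affine_c() unless an SSE2-capable
CPU is detected at runtime. As a rough, illustrative sketch only (the real
dispatcher is generated by rtcd.pl into av1_rtcd.h; warp_affine_fn_t,
warp_affine_dispatch and cpu_has_sse2 below are made-up names), the effect
is equivalent to:

    /* Illustrative sketch of the generated dispatch, assuming the
       warp_affine_c / warp_affine_sse2 prototypes are in scope. */
    typedef void (*warp_affine_fn_t)(
        int32_t *mat, uint8_t *ref, int width, int height, int stride,
        uint8_t *pred, int p_col, int p_row, int p_width, int p_height,
        int p_stride, int subsampling_x, int subsampling_y, int ref_frm,
        int32_t alpha, int32_t beta, int32_t gamma, int32_t delta);

    static warp_affine_fn_t warp_affine_dispatch;

    static void setup_warp_affine_dispatch(int cpu_has_sse2) {
      warp_affine_dispatch = warp_affine_c; /* always available */
      if (cpu_has_sse2) warp_affine_dispatch = warp_affine_sse2;
    }
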
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index daded0c..d1107d5 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -15,6 +15,7 @@
#include <math.h>
#include <assert.h>
+#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
/* clang-format off */
@@ -412,7 +413,7 @@
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
/* clang-format off */
-static const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS*3][8] = {
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS*3][8] = {
// [-1, 0)
{ 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, - 1, 128, 2, - 1, 0, 0, 0 },
{ 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
@@ -891,7 +892,146 @@
within the block, the parameters must satisfy
4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 7 * |delta| <= 1
for this filter to be applicable.
+
+ Note: warp_affine() assumes that the caller has done all of the relevant
+  checks, i.e. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+ are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+ gamma, delta are all in range.
+
+ TODO(david.barker): Maybe support scaled references?
*/
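+// Clamp a 32-bit intermediate to the int16 range. This mirrors the
+// saturation performed by _mm_packs_epi32 in the SSE2 horizontal filter,
+// so the C and SSE2 paths agree on these intermediate values.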
+static INLINE int16_t saturate_int16(int32_t v) {
+  if (v > 32767)
+    return 32767;
+  else if (v < -32768)
+    return -32768;
+  return v;
+}
+
+void warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
+ int stride, uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int ref_frm, int32_t alpha, int32_t beta,
+ int32_t gamma, int32_t delta) {
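+  // tmp holds 15 rows x 8 columns of horizontally filtered samples: row
+  // (k + 7) is the horizontal filter output for source row iy4 + k, with
+  // k running over -7 .. 7.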
+ int16_t tmp[15 * 8];
+ int i, j, k, l, m;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but the commented-out check below
+     is kept for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
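+  // Process the output in 8x8 blocks. For each block, project its centre
+  // (j + 4, i + 4) through the affine model to get the warped position
+  // (x4, y4) at WARPEDMODEL precision, then split it into integer parts
+  // (ix4, iy4) and fractional parts (sx4, sy4).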
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ // In this case, the rightmost pixel sampled is in column
+          // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, i.e. the entire block
+ // will sample only from the leftmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+ }
+ } else if (ix4 >= width + 6) {
+ // In this case, the leftmost pixel sampled is in column
+          // ix4 - 3 + 0 - 3 = ix4 - 6 >= width, i.e. the entire block
+ // will sample only from the rightmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ tmp[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+ }
+ } else {
+ // If we get here, then
+ // the leftmost pixel sampled is
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+ // and the rightmost pixel sampled is at most
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+ // So, assuming that border extension has been done, we
+ // don't need to explicitly clamp values.
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ for (l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int16_t *coeffs =
+ warped_filter[ROUND_POWER_OF_TWO_SIGNED(sx,
+ WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS];
+ int32_t sum = 0;
+ for (m = 0; m < 8; ++m) {
+ sum += ref[iy * stride + ix + m] * coeffs[m];
+ }
+ tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+ sx += alpha;
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+ for (l = -4; l < 4; ++l) {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int16_t *coeffs = warped_filter[ROUND_POWER_OF_TWO_SIGNED(
+ sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS];
+ int32_t sum = 0;
+ for (m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
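+          // Each filter pass scales by (1 << WARPEDPIXEL_FILTER_BITS), so
+          // the combined scale of the two passes is rounded away here.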
+ sum =
+ clip_pixel(ROUND_POWER_OF_TWO(sum, 2 * WARPEDPIXEL_FILTER_BITS));
+ if (ref_frm)
+ *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+ else
+ *p = sum;
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
@@ -901,9 +1041,8 @@
wm->wmmat[5] = wm->wmmat[2];
wm->wmmat[4] = -wm->wmmat[3];
}
- if (wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) {
- int32_t tmp[15 * 8];
- int i, j, k, l, m;
+ if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
+ y_scale == 16) {
int32_t *mat = wm->wmmat;
int32_t alpha, beta, gamma, delta;
@@ -932,78 +1071,9 @@
return;
}
- for (i = p_row; i < p_row + p_height; i += 8) {
- for (j = p_col; j < p_col + p_width; j += 8) {
- int32_t x4, y4, ix4, sx4, iy4, sy4;
- if (subsampling_x)
- x4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
- (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
- if (subsampling_y)
- y4 = ROUND_POWER_OF_TWO_SIGNED(
- mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
- (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
- 1);
- else
- y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
- ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Horizontal filter
- for (k = -7; k < 8; ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
-
- for (l = -4; l < 4; ++l) {
- int ix = ix4 + l;
- int sx = ROUND_POWER_OF_TWO_SIGNED(sx4 + alpha * l + beta * k,
- WARPEDDIFF_PREC_BITS);
- const int16_t *coeffs = warped_filter[sx + WARPEDPIXEL_PREC_SHIFTS];
- int32_t sum = 0;
- for (m = 0; m < 8; ++m) {
- if (ix + m - 3 < 0)
- sum += ref[iy * stride] * coeffs[m];
- else if (ix + m - 3 > width - 1)
- sum += ref[iy * stride + width - 1] * coeffs[m];
- else
- sum += ref[iy * stride + ix + m - 3] * coeffs[m];
- }
- tmp[(k + 7) * 8 + (l + 4)] = sum;
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
- for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
- uint8_t *p =
- &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
- int sy = ROUND_POWER_OF_TWO_SIGNED(sy4 + gamma * l + delta * k,
- WARPEDDIFF_PREC_BITS);
- const int16_t *coeffs = warped_filter[sy + WARPEDPIXEL_PREC_SHIFTS];
- int32_t sum = 0;
- for (m = 0; m < 8; ++m) {
- sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
- }
- sum = clip_pixel(
- ROUND_POWER_OF_TWO_SIGNED(sum, 2 * WARPEDPIXEL_FILTER_BITS));
- if (ref_frm)
- *p = ROUND_POWER_OF_TWO_SIGNED(*p + sum, 1);
- else
- *p = sum;
- }
- }
- }
- }
+ warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, ref_frm,
+ alpha, beta, gamma, delta);
} else {
warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
p_height, p_stride, subsampling_x, subsampling_y, x_scale,
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 76ab53f..24eeb92 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -28,6 +28,8 @@
#define DEFAULT_WMTYPE AFFINE
#endif // CONFIG_WARPED_MOTION
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3][8];
+
typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
const int n, const int stride_points,
const int stride_proj,
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 0000000..5ecc30e
--- /dev/null
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+/* SSE2 version of the rotzoom/affine warp filter */
+void warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
+ int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int ref_frm,
+ int32_t alpha, int32_t beta, int32_t gamma,
+ int32_t delta) {
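+  // Each entry of tmp holds one row of 8 horizontally filtered int16
+  // samples, stored in the column order 0, 2, 4, 6, 1, 3, 5, 7 (see the
+  // horizontal filter below); it plays the same role as int16_t tmp[15 * 8]
+  // in warp_affine_c.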
+ __m128i tmp[15];
+ int i, j, k, l;
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but the commented-out check below
+     is kept for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = p_row; i < p_row + p_height; i += 8) {
+ for (j = p_col; j < p_col + p_width; j += 8) {
+ int32_t x4, y4, ix4, sx4, iy4, sy4;
+ if (subsampling_x)
+ x4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+ (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+ if (subsampling_y)
+ y4 = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ 1);
+ else
+ y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+ ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Horizontal filter
+ for (k = -7; k < 8; ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ if (ix4 <= -7) {
+ // In this case, the rightmost pixel sampled is in column
+          // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, i.e. the entire block
+ // will sample only from the leftmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ ((int16_t *)tmp)[(k + 7) * 8 + l] =
+ ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+ }
+ } else if (ix4 >= width + 6) {
+ // In this case, the leftmost pixel sampled is in column
+          // ix4 - 3 + 0 - 3 = ix4 - 6 >= width, i.e. the entire block
+ // will sample only from the rightmost column
+ // (once border extension is taken into account)
+ for (l = 0; l < 8; ++l) {
+ ((int16_t *)tmp)[(k + 7) * 8 + l] =
+ ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+ }
+ } else {
+ // If we get here, then
+ // the leftmost pixel sampled is
+ // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+ // and the rightmost pixel sampled is at most
+ // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+ // So, assuming that border extension has been done, we
+ // don't need to explicitly clamp values.
+ int sx = sx4 + alpha * (-4) + beta * k;
+
+ // Load per-pixel coefficients
+ __m128i coeffs[8];
+ for (l = 0; l < 8; ++l) {
+ coeffs[l] =
+ ((__m128i *)warped_filter)[ROUND_POWER_OF_TWO_SIGNED(
+ sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS];
+ sx += alpha;
+ }
+
+ // Currently the coefficients are stored in the order
+ // 0 ... 7 for each pixel separately.
+ // Permute the coefficients into 8 registers:
+ // coeffs (0,1), (2,3), (4,5), (6,7) for
+          // i) the even-index pixels, and ii) the odd-index pixels
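+          // For example, coeff_a below ends up holding
+          // { f0[0], f0[1], f2[0], f2[1], f4[0], f4[1], f6[0], f6[1] },
+          // where fN is the 8-tap filter selected for output pixel N.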
+ {
+ // 0 1 0 1 2 3 2 3 for 0, 2
+ __m128i tmp_0 = _mm_unpacklo_epi32(coeffs[0], coeffs[2]);
+ // 0 1 0 1 2 3 2 3 for 1, 3
+ __m128i tmp_1 = _mm_unpacklo_epi32(coeffs[1], coeffs[3]);
+ // 0 1 0 1 2 3 2 3 for 4, 6
+ __m128i tmp_2 = _mm_unpacklo_epi32(coeffs[4], coeffs[6]);
+ // 0 1 0 1 2 3 2 3 for 5, 7
+ __m128i tmp_3 = _mm_unpacklo_epi32(coeffs[5], coeffs[7]);
+ // 4 5 4 5 6 7 6 7 for 0, 2
+ __m128i tmp_4 = _mm_unpackhi_epi32(coeffs[0], coeffs[2]);
+ // 4 5 4 5 6 7 6 7 for 1, 3
+ __m128i tmp_5 = _mm_unpackhi_epi32(coeffs[1], coeffs[3]);
+ // 4 5 4 5 6 7 6 7 for 4, 6
+ __m128i tmp_6 = _mm_unpackhi_epi32(coeffs[4], coeffs[6]);
+ // 4 5 4 5 6 7 6 7 for 5, 7
+ __m128i tmp_7 = _mm_unpackhi_epi32(coeffs[5], coeffs[7]);
+
+ // 0 1 0 1 0 1 0 1 for 0, 2, 4, 6
+ __m128i coeff_a = _mm_unpacklo_epi64(tmp_0, tmp_2);
+ // 0 1 0 1 0 1 0 1 for 1, 3, 5, 7
+ __m128i coeff_b = _mm_unpacklo_epi64(tmp_1, tmp_3);
+ // 2 3 2 3 2 3 2 3 for 0, 2, 4, 6
+ __m128i coeff_c = _mm_unpackhi_epi64(tmp_0, tmp_2);
+ // 2 3 2 3 2 3 2 3 for 1, 3, 5, 7
+ __m128i coeff_d = _mm_unpackhi_epi64(tmp_1, tmp_3);
+ // 4 5 4 5 4 5 4 5 for 0, 2, 4, 6
+ __m128i coeff_e = _mm_unpacklo_epi64(tmp_4, tmp_6);
+ // 4 5 4 5 4 5 4 5 for 1, 3, 5, 7
+ __m128i coeff_f = _mm_unpacklo_epi64(tmp_5, tmp_7);
+ // 6 7 6 7 6 7 6 7 for 0, 2, 4, 6
+ __m128i coeff_g = _mm_unpackhi_epi64(tmp_4, tmp_6);
+ // 6 7 6 7 6 7 6 7 for 1, 3, 5, 7
+ __m128i coeff_h = _mm_unpackhi_epi64(tmp_5, tmp_7);
+
+ // Load source pixels and widen to 16 bits. We want the pixels
+ // ref[iy * stride + ix4 - 7] ... ref[iy * stride + ix4 + 7]
+ // inclusive.
+ __m128i zero = _mm_setzero_si128();
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ // Calculate filtered results
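+            // Each _mm_madd_epi16 adds the contribution of two adjacent
+            // filter taps for four output pixels at once; e.g. lane 0 of
+            // res_a is ref[iy * stride + ix4 - 7] * f0[0] +
+            // ref[iy * stride + ix4 - 6] * f0[1] (with fN as above).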
+ __m128i src_a = _mm_unpacklo_epi8(src, zero);
+ __m128i res_a = _mm_madd_epi16(src_a, coeff_a);
+ __m128i src_b = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
+ __m128i res_b = _mm_madd_epi16(src_b, coeff_b);
+ __m128i src_c = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
+ __m128i res_c = _mm_madd_epi16(src_c, coeff_c);
+ __m128i src_d = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
+ __m128i res_d = _mm_madd_epi16(src_d, coeff_d);
+ __m128i src_e = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
+ __m128i res_e = _mm_madd_epi16(src_e, coeff_e);
+ __m128i src_f = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
+ __m128i res_f = _mm_madd_epi16(src_f, coeff_f);
+ __m128i src_g = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
+ __m128i res_g = _mm_madd_epi16(src_g, coeff_g);
+ __m128i src_h = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
+ __m128i res_h = _mm_madd_epi16(src_h, coeff_h);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_a, res_e),
+ _mm_add_epi32(res_c, res_g));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_b, res_f),
+ _mm_add_epi32(res_d, res_h));
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+ }
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + gamma * (-4) + delta * k;
+
+ // Load per-pixel coefficients
+ __m128i coeffs[8];
+ for (l = 0; l < 8; ++l) {
+ coeffs[l] = ((__m128i *)warped_filter)[ROUND_POWER_OF_TWO_SIGNED(
+ sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS];
+ sy += gamma;
+ }
+
+ {
+ // Rearrange coefficients in the same way as for the horizontal filter
+ __m128i tmp_0 = _mm_unpacklo_epi32(coeffs[0], coeffs[2]);
+ __m128i tmp_1 = _mm_unpacklo_epi32(coeffs[1], coeffs[3]);
+ __m128i tmp_2 = _mm_unpacklo_epi32(coeffs[4], coeffs[6]);
+ __m128i tmp_3 = _mm_unpacklo_epi32(coeffs[5], coeffs[7]);
+ __m128i tmp_4 = _mm_unpackhi_epi32(coeffs[0], coeffs[2]);
+ __m128i tmp_5 = _mm_unpackhi_epi32(coeffs[1], coeffs[3]);
+ __m128i tmp_6 = _mm_unpackhi_epi32(coeffs[4], coeffs[6]);
+ __m128i tmp_7 = _mm_unpackhi_epi32(coeffs[5], coeffs[7]);
+
+ __m128i coeff_a = _mm_unpacklo_epi64(tmp_0, tmp_2);
+ __m128i coeff_b = _mm_unpacklo_epi64(tmp_1, tmp_3);
+ __m128i coeff_c = _mm_unpackhi_epi64(tmp_0, tmp_2);
+ __m128i coeff_d = _mm_unpackhi_epi64(tmp_1, tmp_3);
+ __m128i coeff_e = _mm_unpacklo_epi64(tmp_4, tmp_6);
+ __m128i coeff_f = _mm_unpacklo_epi64(tmp_5, tmp_7);
+ __m128i coeff_g = _mm_unpackhi_epi64(tmp_4, tmp_6);
+ __m128i coeff_h = _mm_unpackhi_epi64(tmp_5, tmp_7);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ __m128i *src = tmp + (k + 4);
+ __m128i src_a = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i src_b = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i src_c = _mm_unpacklo_epi16(src[2], src[3]);
+ __m128i src_d = _mm_unpackhi_epi16(src[2], src[3]);
+ __m128i src_e = _mm_unpacklo_epi16(src[4], src[5]);
+ __m128i src_f = _mm_unpackhi_epi16(src[4], src[5]);
+ __m128i src_g = _mm_unpacklo_epi16(src[6], src[7]);
+ __m128i src_h = _mm_unpackhi_epi16(src[6], src[7]);
+
+ // Calculate the filtered pixels
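+          // As in the horizontal pass, each _mm_madd_epi16 adds two
+          // adjacent taps for four columns at once; e.g. lane 0 of res_a
+          // combines column 0 of tmp rows (k + 4) and (k + 5) with taps 0
+          // and 1 of the vertical filter for output column 0.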
+ __m128i res_a = _mm_madd_epi16(src_a, coeff_a);
+ __m128i res_b = _mm_madd_epi16(src_b, coeff_b);
+ __m128i res_c = _mm_madd_epi16(src_c, coeff_c);
+ __m128i res_d = _mm_madd_epi16(src_d, coeff_d);
+ __m128i res_e = _mm_madd_epi16(src_e, coeff_e);
+ __m128i res_f = _mm_madd_epi16(src_f, coeff_f);
+ __m128i res_g = _mm_madd_epi16(src_g, coeff_g);
+ __m128i res_h = _mm_madd_epi16(src_h, coeff_h);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_a, res_c),
+ _mm_add_epi32(res_e, res_g));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_b, res_d),
+ _mm_add_epi32(res_f, res_h));
+
+ // Round and pack down to pixels
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i round_const =
+ _mm_set1_epi32((1 << (2 * WARPEDPIXEL_FILTER_BITS)) >> 1);
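+          // Adding round_const and arithmetic-shifting right by
+          // 2 * WARPEDPIXEL_FILTER_BITS implements the same rounding as
+          // ROUND_POWER_OF_TWO in warp_affine_c.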
+
+ __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+ __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
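+          // When averaging with the existing prediction, _mm_avg_epu8
+          // computes (a + b + 1) >> 1, matching
+          // ROUND_POWER_OF_TWO(*p + sum, 1) in warp_affine_c.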
+ __m128i *p =
+ (__m128i *)&pred[(i - p_row + k + 4) * p_stride + (j - p_col)];
+
+ if (ref_frm) {
+ __m128i orig = _mm_loadl_epi64(p);
+ _mm_storel_epi64(p, _mm_avg_epu8(res_8bit, orig));
+ } else {
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+ }
+ }
+}