JNT_COMP: add SIMD implementations for C functions

Add SIMD implementations of the C functions for low bit-depth,
making these functions roughly 3-4x faster than their C versions.
Change-Id: Icca0b07b25489759be9504aaec09d1239076fc52
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 845667d..4567f6e 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -59,7 +59,8 @@
set(AOM_DSP_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
+ "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_SSE4_1
"${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 6d7d48e..594fab4 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -107,6 +107,7 @@
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/variance_ssse3.c
# interpolation filters
DSP_SRCS-yes += aom_convolve.c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 0b7fbca..5a487ab 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1110,6 +1110,7 @@
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+ specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
@@ -1339,6 +1340,7 @@
add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+ specialize qw/aom_jnt_comp_avg_pred ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 16ad001..8498093 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -214,7 +214,7 @@
\
aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
\
- return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
}
#else // CONFIG_JNT_COMP
#define SUBPIX_AVG_VAR(W, H) \
@@ -397,13 +397,11 @@
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
- double sum = bck_offset + fwd_offset;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
- tmp = (int)(0.5 + tmp / sum);
- if (tmp > 255) tmp = 255;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
comp_pred[j] = (uint8_t)tmp;
}
comp_pred += width;
@@ -420,7 +418,6 @@
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
- double sum = bck_offset + fwd_offset;
aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
ref_stride);
@@ -428,8 +425,7 @@
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
- tmp = (int)(0.5 + tmp / sum);
- if (tmp > 255) tmp = 255;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
comp_pred[j] = (uint8_t)tmp;
}
comp_pred += width;
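
For reference, the rounding change above is exact rather than approximate, as
long as every jnt weight pair satisfies fwd_offset + bck_offset ==
(1 << DIST_PRECISION_BITS). A minimal worked check under that assumption
(illustrative only, not part of the patch):

    #include <assert.h>

    static void check_jnt_rounding_example(void) {
      const int fwd = 9, bck = 7;              // hypothetical pair summing to 16
      const int pred = 100, ref = 200;
      const int tmp = pred * bck + ref * fwd;  // 700 + 1800 = 2500
      // Old rounding: float divide by the weight sum, then round half up.
      const int old_way = (int)(0.5 + tmp / (double)(fwd + bck));  // 156
      // New rounding: ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS), here 4 bits.
      const int new_way = (tmp + 8) >> 4;                          // also 156
      assert(old_way == new_way);
      // The largest possible tmp is 255 * 16 = 4080, and (4080 + 8) >> 4 == 255,
      // so the old explicit clamp to 255 can be dropped.
    }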
diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c
new file mode 100644
index 0000000..ce573af
--- /dev/null
+++ b/aom_dsp/x86/variance_ssse3.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "./av1_rtcd.h"
+
+#if CONFIG_JNT_COMP
+static void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w,
+ const __m128i *r, void *const result) {
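+  // Interleaving p0/p1 byte-wise lets _mm_maddubs_epi16 (unsigned bytes from
+  // the first operand times signed bytes from the second, with pairwise
+  // horizontal add) produce p0 * w0 + p1 * w1 in each 16-bit lane; *r holds
+  // the rounding offset for the DIST_PRECISION_BITS shift, mirroring
+  // ROUND_POWER_OF_TWO().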
+ __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+ __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+ __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+ __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+ __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+ __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+ __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+ __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param) {
+ int i;
+ const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+ const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r =
+ _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+ if (width >= 16) {
+    // Read 16 pixels, one row at a time
+ assert(!(width & 15));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+    // Read 8 pixels, two rows at a time
+ assert(!(width & 7));
+ assert(!(width & 1));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+    // Read 4 pixels, four rows at a time
+ assert(!(width & 3));
+ assert(!(height & 3));
+ for (i = 0; i < height; i += 4) {
+ __m128i p0_0 = xx_loadl_32(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_32(ref + 1 * ref_stride);
+ __m128i p0_2 = xx_loadl_32(ref + 2 * ref_stride);
+ __m128i p0_3 = xx_loadl_32(ref + 3 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(p0_0, p0_1),
+ _mm_unpacklo_epi32(p0_2, p0_3));
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_ssse3(uint8_t *comp_pred,
+ const uint8_t *pred, int width,
+ int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param) {
+ int n;
+ int i;
+ aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+ ref_stride);
+  /* The total number of pixels must be a multiple of 16; even the smallest
+     block size, 4x4, has exactly 16. */
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+
+ const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+ const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r =
+ _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ }
+}
+#endif // CONFIG_JNT_COMP
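
As a sanity check, the new path can be compared against the C reference via the
rtcd-generated names. A minimal sketch (hypothetical test code, not part of this
change; assumes the rtcd names below and a weight pair summing to
1 << DIST_PRECISION_BITS):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    #include "./aom_dsp_rtcd.h"

    static void check_jnt_comp_avg_pred_16x16(void) {
      uint8_t pred[16 * 16], ref[16 * 16], out_c[16 * 16], out_simd[16 * 16];
      JNT_COMP_PARAMS jcp_param;
      jcp_param.fwd_offset = 9;  // example pair: 9 + 7 == 1 << DIST_PRECISION_BITS
      jcp_param.bck_offset = 7;
      for (int i = 0; i < 16 * 16; ++i) {
        pred[i] = (uint8_t)(i & 0xff);
        ref[i] = (uint8_t)((i * 7 + 3) & 0xff);
      }
      aom_jnt_comp_avg_pred_c(out_c, pred, 16, 16, ref, /*ref_stride=*/16,
                              &jcp_param);
      aom_jnt_comp_avg_pred_ssse3(out_simd, pred, 16, 16, ref, 16, &jcp_param);
      assert(!memcmp(out_c, out_simd, sizeof(out_c)));
    }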
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 80e7aff..81fcc4e 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -288,6 +288,9 @@
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c")
if (CONFIG_HIGHBITDEPTH)
set(AOM_AV1_COMMON_INTRIN_SSSE3
${AOM_AV1_COMMON_INTRIN_SSSE3}
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 43432c8..326e678 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -155,6 +155,7 @@
ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/convolve_2d_sse4.c
ifeq ($(CONFIG_HIGHBITDEPTH),yes)
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c
endif
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b408db3..93642d9 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -557,6 +557,11 @@
specialize qw/av1_convolve_2d_scale sse4_1/;
}
+ if (aom_config("CONFIG_JNT_COMP") eq "yes") {
+ add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+ specialize qw/av1_jnt_convolve_2d sse4_1/;
+ }
+
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
specialize qw/av1_highbd_convolve_2d ssse3/;
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 7ee6bc2..b09a60b 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -417,7 +417,54 @@
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->do_average)
+ dst[y * dst_stride + x] += res;
+ else
+ dst[y * dst_stride + x] = res;
+ }
+ }
+}
+
#if CONFIG_JNT_COMP
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int x, y, k;
+ uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (y = 0; y < im_h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = 0;
+ for (k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ im_block[y * im_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
+ }
+ }
+
+ // vertical filter
+ uint8_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ CONV_BUF_TYPE sum = 0;
+ for (k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->bck_offset == -1) {
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
@@ -432,15 +479,10 @@
dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
}
}
-#else
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
-#endif // CONFIG_JNT_COMP
}
}
}
+#endif // CONFIG_JNT_COMP
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
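
For context on the jnt branch above: the compound buffer is filled in two
passes, and the weighted path is scaled to match the unweighted one. A sketch of
the intended arithmetic, assuming fwd_offset + bck_offset ==
(1 << DIST_PRECISION_BITS):

    /* Unweighted compound: pass 0 stores res0, pass 1 adds res1, so dst ends
     * at res0 + res1 (twice the plain average).
     * Weighted (jnt) compound:
     *   pass 0 (do_average == 0): dst = res0 * fwd_offset;
     *   pass 1 (do_average == 1): dst = (res0 * fwd_offset + res1 * bck_offset)
     *                                   >> (DIST_PRECISION_BITS - 1);
     * Shifting by DIST_PRECISION_BITS - 1 rather than DIST_PRECISION_BITS keeps
     * the result at the same res0 + res1 scale as the unweighted path. */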
@@ -571,7 +613,60 @@
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
+ if (conv_params->do_average)
+ dst[y * dst_stride + x] += res;
+ else
+ dst[y * dst_stride + x] = res;
+ }
+ }
+}
+
#if CONFIG_JNT_COMP
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int x, y, k;
+ int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (y = 0; y < im_h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int32_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ CONV_BUF_TYPE sum = 1 << offset_bits;
+ for (k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
if (conv_params->fwd_offset == -1) {
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
@@ -586,15 +681,10 @@
dst[y * dst_stride + x] = res * conv_params->fwd_offset;
}
}
-#else
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
-#endif // CONFIG_JNT_COMP
}
}
}
+#endif // CONFIG_JNT_COMP
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
@@ -716,15 +806,15 @@
// horizontal and vertical parameters are swapped because of the transpose
#if CONFIG_JNT_COMP
if (scaled)
- av1_convolve_2d_scale_c(tr_src + fo_horiz * tr_src_stride + fo_vert,
- tr_src_stride, tr_dst, tr_dst_stride, h, w,
- &filter_params_y, &filter_params_x, subpel_y_q4,
- y_step_q4, subpel_x_q4, x_step_q4, conv_params);
+ av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
+ tr_src_stride, tr_dst, tr_dst_stride, h, w,
+ &filter_params_y, &filter_params_x, subpel_y_q4,
+ y_step_q4, subpel_x_q4, x_step_q4, conv_params);
else
- av1_convolve_2d_c(tr_src + fo_horiz * tr_src_stride + fo_vert,
- tr_src_stride, tr_dst, tr_dst_stride, h, w,
- &filter_params_y, &filter_params_x, subpel_y_q4,
- subpel_x_q4, conv_params);
+ av1_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
+ tr_src_stride, tr_dst, tr_dst_stride, h, w,
+ &filter_params_y, &filter_params_x, subpel_y_q4,
+ subpel_x_q4, conv_params);
#else
if (scaled)
av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
@@ -742,15 +832,15 @@
} else {
#if CONFIG_JNT_COMP
if (scaled)
- av1_convolve_2d_scale_c(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, x_step_q4,
- subpel_y_q4, y_step_q4, conv_params);
+ av1_convolve_2d_scale(src, src_stride, conv_params->dst,
+ conv_params->dst_stride, w, h, &filter_params_x,
+ &filter_params_y, subpel_x_q4, x_step_q4,
+ subpel_y_q4, y_step_q4, conv_params);
else
- av1_convolve_2d_c(src, src_stride, conv_params->dst,
- conv_params->dst_stride, w, h, &filter_params_x,
- &filter_params_y, subpel_x_q4, subpel_y_q4,
- conv_params);
+ av1_jnt_convolve_2d(src, src_stride, conv_params->dst,
+ conv_params->dst_stride, w, h, &filter_params_x,
+ &filter_params_y, subpel_x_q4, subpel_y_q4,
+ conv_params);
#else
if (scaled)
av1_convolve_2d_scale(src, src_stride, conv_params->dst,
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 1f0fedb..c877f64 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -260,6 +260,12 @@
(1 << (offset_bits - conv_params->round_1 - 1)));
const __m128i sub = _mm_set1_epi32(sub32);
+#if CONFIG_JNT_COMP
+ const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
+ const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
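+  // jnt_round is the rounding offset for the (DIST_PRECISION_BITS - 1) shift
+  // applied when the weighted passes are averaged below.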
+ const __m128i jnt_round = _mm_set1_epi32(1 << (DIST_PRECISION_BITS - 2));
+#endif // CONFIG_JNT_COMP
+
int y_qn = subpel_y_qn;
for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
@@ -305,10 +311,29 @@
const __m128i subbed = _mm_sub_epi32(shifted, sub);
int32_t *dst_x = dst + y * dst_stride + x;
+#if CONFIG_JNT_COMP
+ __m128i result;
+ if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+ if (conv_params->do_average) {
+ result = _mm_srai_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i *)dst_x),
+ _mm_mullo_epi32(subbed, bck_offset)),
+ jnt_round),
+ DIST_PRECISION_BITS - 1);
+ } else {
+ result = _mm_mullo_epi32(subbed, fwd_offset);
+ }
+ } else {
+ result = (conv_params->do_average)
+ ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
+ : subbed;
+ }
+#else
const __m128i result =
(conv_params->do_average)
? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
: subbed;
+#endif // CONFIG_JNT_COMP
_mm_storeu_si128((__m128i *)dst_x, result);
}
@@ -317,10 +342,24 @@
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+ if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+ if (conv_params->do_average) {
+ dst[y * dst_stride + x] += res * conv_params->bck_offset;
+
+ dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+ } else {
+ dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+ }
+ } else {
+#endif // CONFIG_JNT_COMP
+ if (conv_params->do_average)
+ dst[y * dst_stride + x] += res;
+ else
+ dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+ }
+#endif // CONFIG_JNT_COMP
}
}
}
@@ -342,6 +381,12 @@
(1 << (offset_bits - conv_params->round_1 - 1)));
const __m128i sub = _mm_set1_epi32(sub32);
+#if CONFIG_JNT_COMP
+ const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
+ const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
+ const __m128i jnt_round = _mm_set1_epi32(1 << (DIST_PRECISION_BITS - 2));
+#endif // CONFIG_JNT_COMP
+
int y_qn = subpel_y_qn;
for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
@@ -384,10 +429,29 @@
const __m128i subbed = _mm_sub_epi32(shifted, sub);
int32_t *dst_x = dst + y * dst_stride + x;
+#if CONFIG_JNT_COMP
+ __m128i result;
+ if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+ if (conv_params->do_average) {
+ result = _mm_srai_epi32(
+ _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i *)dst_x),
+ _mm_mullo_epi32(subbed, bck_offset)),
+ jnt_round),
+ DIST_PRECISION_BITS - 1);
+ } else {
+ result = _mm_mullo_epi32(subbed, fwd_offset);
+ }
+ } else {
+ result = (conv_params->do_average)
+ ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
+ : subbed;
+ }
+#else
const __m128i result =
(conv_params->do_average)
? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
: subbed;
+#endif // CONFIG_JNT_COMP
_mm_storeu_si128((__m128i *)dst_x, result);
}
@@ -396,10 +460,24 @@
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+ if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+ if (conv_params->do_average) {
+ dst[y * dst_stride + x] += res * conv_params->bck_offset;
+
+ dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+ } else {
+ dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+ }
+ } else {
+#endif // CONFIG_JNT_COMP
+ if (conv_params->do_average)
+ dst[y * dst_stride + x] += res;
+ else
+ dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+ }
+#endif // CONFIG_JNT_COMP
}
}
}
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c
new file mode 100644
index 0000000..a3f4649
--- /dev/null
+++ b/av1/common/x86/convolve_2d_sse4.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+#if CONFIG_JNT_COMP
+#if CONFIG_COMPOUND_ROUND
+void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ DECLARE_ALIGNED(16, uint8_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
+ const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_packus_epi16(res, res);
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint8_t *data = &im_block[i * im_stride + j];
+ const __m128i src_01 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
+ const __m128i src_23 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
+ const __m128i src_45 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
+ const __m128i src_67 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
+
+ const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
+ const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
+ const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
+ const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
+ const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
+ const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+ // NOTE(chengchen):
+ // only this part is different from av1_convolve_2d_sse2
+          // original C function at: av1/common/convolve.c:
+ // av1_convolve_2d_c() and av1_jnt_convolve_2d_c()
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ if (do_average) {
+ _mm_storeu_si128(
+ p + 0, _mm_srai_epi32(
+ _mm_add_epi32(_mm_loadu_si128(p + 0),
+ _mm_mullo_epi32(res_lo_round, wt1)),
+ DIST_PRECISION_BITS - 1));
+
+ _mm_storeu_si128(
+ p + 1, _mm_srai_epi32(
+ _mm_add_epi32(_mm_loadu_si128(p + 1),
+ _mm_mullo_epi32(res_hi_round, wt1)),
+ DIST_PRECISION_BITS - 1));
+ } else {
+ _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));
+ _mm_storeu_si128(p + 1, _mm_mullo_epi32(res_hi_round, wt0));
+ }
+ } else {
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ if (do_average) {
+ _mm_storeu_si128(
+ p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
+ _mm_storeu_si128(
+ p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ } else {
+ _mm_storeu_si128(p + 0, res_lo_round);
+ _mm_storeu_si128(p + 1, res_hi_round);
+ }
+ }
+ }
+ }
+ }
+}
+#else // CONFIG_COMPOUND_ROUND
+void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
+ const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
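+    // round_const folds the rounding half into the (bd + FILTER_BITS - 1) bias
+    // that keeps intermediate sums nonnegative (cf. the asserts in
+    // av1_jnt_convolve_2d_c).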
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
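+    // Besides the rounding half, round_const removes the horizontal-stage
+    // bias, which the vertical taps (summing to 1 << FILTER_BITS) scale up to
+    // 1 << (bd + 2 * FILTER_BITS - round_0 - 1); this matches the explicit
+    // offset subtraction in the C version.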
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+ // FIXME(chengchen): validate this implementation
+          // original C function at: av1/common/convolve.c: av1_convolve_2d_c
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ if (do_average) {
+ _mm_storeu_si128(
+ p + 0, _mm_srai_epi32(
+ _mm_add_epi32(_mm_loadu_si128(p + 0),
+ _mm_mullo_epi32(res_lo_round, wt1)),
+ DIST_PRECISION_BITS - 1));
+
+ _mm_storeu_si128(
+ p + 1, _mm_srai_epi32(
+ _mm_add_epi32(_mm_loadu_si128(p + 1),
+ _mm_mullo_epi32(res_hi_round, wt1)),
+ DIST_PRECISION_BITS - 1));
+ } else {
+ _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));
+ _mm_storeu_si128(p + 1, _mm_mullo_epi32(res_hi_round, wt0));
+ }
+ } else {
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ if (do_average) {
+ _mm_storeu_si128(
+ p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
+ _mm_storeu_si128(
+ p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ } else {
+ _mm_storeu_si128(p + 0, res_lo_round);
+ _mm_storeu_si128(p + 1, res_hi_round);
+ }
+ }
+ }
+ }
+ }
+}
+#endif // CONFIG_COMPOUND_ROUND
+#endif // CONFIG_JNT_COMP