JNT_COMP: add SIMD implementations for C functions

Add SIMD implementations of the C functions for low bit-depth, making
these functions 3~4x faster than their C counterparts.
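
The jnt averaging in the C reference is also changed from a
floating-point division by (fwd_offset + bck_offset) to a fixed-point
ROUND_POWER_OF_TWO() by DIST_PRECISION_BITS, so the SIMD path can match
the C reference bit-exactly. A minimal scalar sketch of the per-pixel
blend, assuming the two weights sum to (1 << DIST_PRECISION_BITS):

  int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
  comp_pred[j] = (uint8_t)ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);

For example, assuming DIST_PRECISION_BITS == 4 and weights
(fwd, bck) = (9, 7), the pair (ref, pred) = (100, 60) blends to
(9 * 100 + 7 * 60 + 8) >> 4 = 83.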

Change-Id: Icca0b07b25489759be9504aaec09d1239076fc52
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 845667d..4567f6e 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -59,7 +59,8 @@
 set(AOM_DSP_COMMON_INTRIN_SSSE3
     "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
     "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
+    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c"
+    "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c")
 
 set(AOM_DSP_COMMON_INTRIN_SSE4_1
     "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 6d7d48e..594fab4 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -107,6 +107,7 @@
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
+DSP_SRCS-$(HAVE_SSSE3)  += x86/variance_ssse3.c
 
 # interpolation filters
 DSP_SRCS-yes += aom_convolve.c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 0b7fbca..5a487ab 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1110,6 +1110,7 @@
 
   if (aom_config("CONFIG_JNT_COMP") eq "yes") {
     add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+    specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
   }
 
   if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
@@ -1339,6 +1340,7 @@
   add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
   if (aom_config("CONFIG_JNT_COMP") eq "yes") {
     add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+    specialize qw/aom_jnt_comp_avg_pred ssse3/;
   }
   if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 16ad001..8498093 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -214,7 +214,7 @@
                                                                           \
     aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                           \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);         \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);             \
   }
 #else  // CONFIG_JNT_COMP
 #define SUBPIX_AVG_VAR(W, H)                                            \
@@ -397,13 +397,11 @@
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
-  double sum = bck_offset + fwd_offset;
 
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
-      tmp = (int)(0.5 + tmp / sum);
-      if (tmp > 255) tmp = 255;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
       comp_pred[j] = (uint8_t)tmp;
     }
     comp_pred += width;
@@ -420,7 +418,6 @@
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
-  double sum = bck_offset + fwd_offset;
 
   aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                      ref_stride);
@@ -428,8 +425,7 @@
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = (int)(0.5 + tmp / sum);
-      if (tmp > 255) tmp = 255;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
       comp_pred[j] = (uint8_t)tmp;
     }
     comp_pred += width;
diff --git a/aom_dsp/x86/variance_ssse3.c b/aom_dsp/x86/variance_ssse3.c
new file mode 100644
index 0000000..ce573af
--- /dev/null
+++ b/aom_dsp/x86/variance_ssse3.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>  // SSSE3
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "./av1_rtcd.h"
+
+#if CONFIG_JNT_COMP
+static void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w,
+                                 const __m128i *r, void *const result) {
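+  // p0/p1 bytes are interleaved and multiplied against the interleaved
+  // weight bytes: _mm_maddubs_epi16 multiplies unsigned bytes from its
+  // first operand by signed bytes from its second and adds adjacent pairs,
+  // yielding p0 * w0 + p1 * w1 per pixel in 16 bits.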
+  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                 int width, int height, const uint8_t *ref,
+                                 int ref_stride,
+                                 const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
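+  // _mm_set_epi8 takes its arguments from the high byte down, so the lanes
+  // hold the repeating pattern w0, w1, ... from byte 0 upward, matching the
+  // (p0, p1) byte interleave in compute_jnt_comp_avg().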
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
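+  // 0.5 in DIST_PRECISION_BITS fixed point, added before the arithmetic
+  // right shift to round to nearest.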
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  if (width >= 16) {
+    // Read 16 pixels one row at a time
+    assert(!(width & 15));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 16) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+        comp_pred += 16;
+        pred += 16;
+        ref += 16;
+      }
+      ref += ref_stride - width;
+    }
+  } else if (width >= 8) {
+    // Read 8 pixels from two rows at a time
+    assert(!(width & 7));
+    assert(!(height & 1));
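+    // The pred buffer is contiguous (stride == width), so one 16-byte load
+    // covers both 8-pixel rows; only ref needs per-row loads.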
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 2 * ref_stride;
+    }
+  } else {
+    // Read 4 pixels from four rows at a time
+    assert(!(width & 3));
+    assert(!(height & 3));
+    for (i = 0; i < height; i += 4) {
+      __m128i p0_0 = xx_loadl_32(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_32(ref + 1 * ref_stride);
+      __m128i p0_2 = xx_loadl_32(ref + 2 * ref_stride);
+      __m128i p0_3 = xx_loadl_32(ref + 3 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(p0_0, p0_1),
+                                      _mm_unpacklo_epi32(p0_2, p0_3));
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 4 * ref_stride;
+    }
+  }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_ssse3(uint8_t *comp_pred,
+                                           const uint8_t *pred, int width,
+                                           int height, int subpel_x_q3,
+                                           int subpel_y_q3, const uint8_t *ref,
+                                           int ref_stride,
+                                           const JNT_COMP_PARAMS *jcp_param) {
+  int n;
+  int i;
+  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+                     ref_stride);
+  // The total number of pixels must be a multiple of 16 (e.g., a 4x4 block).
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+    comp_pred += 16;
+    pred += 16;
+  }
+}
+#endif  // CONFIG_JNT_COMP
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 80e7aff..81fcc4e 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -288,6 +288,9 @@
   set(AOM_AV1_COMMON_INTRIN_SSE2
       ${AOM_AV1_COMMON_INTRIN_SSE2}
       "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
+  set(AOM_AV1_COMMON_INTRIN_SSE4_1
+      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+      "${AOM_ROOT}/av1/common/x86/convolve_2d_sse4.c")
   if (CONFIG_HIGHBITDEPTH)
     set(AOM_AV1_COMMON_INTRIN_SSSE3
         ${AOM_AV1_COMMON_INTRIN_SSSE3}
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 43432c8..326e678 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -155,6 +155,7 @@
 
 ifeq ($(CONFIG_CONVOLVE_ROUND),yes)
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/convolve_2d_sse2.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/convolve_2d_sse4.c
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/highbd_convolve_2d_ssse3.c
 endif
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b408db3..93642d9 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -557,6 +557,11 @@
     specialize qw/av1_convolve_2d_scale sse4_1/;
   }
 
+  if (aom_config("CONFIG_JNT_COMP") eq "yes") {
+    add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+    specialize qw/av1_jnt_convolve_2d sse4_1/;
+  }
+
   if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
     specialize qw/av1_highbd_convolve_2d ssse3/;
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 7ee6bc2..b09a60b 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -417,7 +417,54 @@
         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
       }
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->do_average)
+        dst[y * dst_stride + x] += res;
+      else
+        dst[y * dst_stride + x] = res;
+    }
+  }
+}
+
 #if CONFIG_JNT_COMP
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
+                           CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+                           InterpFilterParams *filter_params_x,
+                           InterpFilterParams *filter_params_y,
+                           const int subpel_x_q4, const int subpel_y_q4,
+                           ConvolveParams *conv_params) {
+  int x, y, k;
+  uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (y = 0; y < im_h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int32_t sum = 0;
+      for (k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      im_block[y * im_stride + x] =
+          clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
+    }
+  }
+
+  // vertical filter
+  uint8_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 0;
+      for (k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
       if (conv_params->bck_offset == -1) {
         if (conv_params->do_average)
           dst[y * dst_stride + x] += res;
@@ -432,15 +479,10 @@
           dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
         }
       }
-#else
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
-#endif  // CONFIG_JNT_COMP
     }
   }
 }
+#endif  // CONFIG_JNT_COMP
 
 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
@@ -571,7 +613,60 @@
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                           ((1 << (offset_bits - conv_params->round_1)) +
                            (1 << (offset_bits - conv_params->round_1 - 1)));
+      if (conv_params->do_average)
+        dst[y * dst_stride + x] += res;
+      else
+        dst[y * dst_stride + x] = res;
+    }
+  }
+}
+
 #if CONFIG_JNT_COMP
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
+                           CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+                           InterpFilterParams *filter_params_x,
+                           InterpFilterParams *filter_params_y,
+                           const int subpel_x_q4, const int subpel_y_q4,
+                           ConvolveParams *conv_params) {
+  int x, y, k;
+  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bd = 8;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (y = 0; y < im_h; ++y) {
+    for (x = 0; x < w; ++x) {
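+      // Bias the accumulator so the horizontal sums stay nonnegative
+      // (checked by the assert below).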
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+      im_block[y * im_stride + x] =
+          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // vertical filter
+  int32_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 1 << offset_bits;
+      for (k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
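+      // Subtract the offsets added in the two passes above to recover the
+      // signed result.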
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));
       if (conv_params->fwd_offset == -1) {
         if (conv_params->do_average)
           dst[y * dst_stride + x] += res;
@@ -586,15 +681,10 @@
           dst[y * dst_stride + x] = res * conv_params->fwd_offset;
         }
       }
-#else
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
-#endif  // CONFIG_JNT_COMP
     }
   }
 }
+#endif  // CONFIG_JNT_COMP
 
 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
@@ -716,15 +806,15 @@
 // horizontal and vertical parameters are swapped because of the transpose
 #if CONFIG_JNT_COMP
     if (scaled)
-      av1_convolve_2d_scale_c(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                              tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                              &filter_params_y, &filter_params_x, subpel_y_q4,
-                              y_step_q4, subpel_x_q4, x_step_q4, conv_params);
+      av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
+                            tr_src_stride, tr_dst, tr_dst_stride, h, w,
+                            &filter_params_y, &filter_params_x, subpel_y_q4,
+                            y_step_q4, subpel_x_q4, x_step_q4, conv_params);
     else
-      av1_convolve_2d_c(tr_src + fo_horiz * tr_src_stride + fo_vert,
-                        tr_src_stride, tr_dst, tr_dst_stride, h, w,
-                        &filter_params_y, &filter_params_x, subpel_y_q4,
-                        subpel_x_q4, conv_params);
+      av1_jnt_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
+                          tr_src_stride, tr_dst, tr_dst_stride, h, w,
+                          &filter_params_y, &filter_params_x, subpel_y_q4,
+                          subpel_x_q4, conv_params);
 #else
     if (scaled)
       av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert,
@@ -742,15 +832,15 @@
   } else {
 #if CONFIG_JNT_COMP
     if (scaled)
-      av1_convolve_2d_scale_c(src, src_stride, conv_params->dst,
-                              conv_params->dst_stride, w, h, &filter_params_x,
-                              &filter_params_y, subpel_x_q4, x_step_q4,
-                              subpel_y_q4, y_step_q4, conv_params);
+      av1_convolve_2d_scale(src, src_stride, conv_params->dst,
+                            conv_params->dst_stride, w, h, &filter_params_x,
+                            &filter_params_y, subpel_x_q4, x_step_q4,
+                            subpel_y_q4, y_step_q4, conv_params);
     else
-      av1_convolve_2d_c(src, src_stride, conv_params->dst,
-                        conv_params->dst_stride, w, h, &filter_params_x,
-                        &filter_params_y, subpel_x_q4, subpel_y_q4,
-                        conv_params);
+      av1_jnt_convolve_2d(src, src_stride, conv_params->dst,
+                          conv_params->dst_stride, w, h, &filter_params_x,
+                          &filter_params_y, subpel_x_q4, subpel_y_q4,
+                          conv_params);
 #else
     if (scaled)
       av1_convolve_2d_scale(src, src_stride, conv_params->dst,
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 1f0fedb..c877f64 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -260,6 +260,12 @@
                          (1 << (offset_bits - conv_params->round_1 - 1)));
   const __m128i sub = _mm_set1_epi32(sub32);
 
+#if CONFIG_JNT_COMP
+  const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
+  const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
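+  // 0.5 rounding offset for the >> (DIST_PRECISION_BITS - 1) shift applied
+  // in the jnt_comp do_average path below.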
+  const __m128i jnt_round = _mm_set1_epi32(1 << (DIST_PRECISION_BITS - 2));
+#endif  // CONFIG_JNT_COMP
+
   int y_qn = subpel_y_qn;
   for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
     const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
@@ -305,10 +311,29 @@
       const __m128i subbed = _mm_sub_epi32(shifted, sub);
 
       int32_t *dst_x = dst + y * dst_stride + x;
+#if CONFIG_JNT_COMP
+      __m128i result;
+      if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+        if (conv_params->do_average) {
+          result = _mm_srai_epi32(
+              _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i *)dst_x),
+                                          _mm_mullo_epi32(subbed, bck_offset)),
+                            jnt_round),
+              DIST_PRECISION_BITS - 1);
+        } else {
+          result = _mm_mullo_epi32(subbed, fwd_offset);
+        }
+      } else {
+        result = (conv_params->do_average)
+                     ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
+                     : subbed;
+      }
+#else
       const __m128i result =
           (conv_params->do_average)
               ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
               : subbed;
+#endif  // CONFIG_JNT_COMP
 
       _mm_storeu_si128((__m128i *)dst_x, result);
     }
@@ -317,10 +342,24 @@
       CONV_BUF_TYPE sum = 1 << offset_bits;
       for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+      if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
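+        // Note: unlike the vector path above, no jnt_round offset is added
+        // before the shift here, matching the unrounded shift in
+        // av1_jnt_convolve_2d_c().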
+        if (conv_params->do_average) {
+          dst[y * dst_stride + x] += res * conv_params->bck_offset;
+
+          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+        } else {
+          dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+        }
+      } else {
+#endif  // CONFIG_JNT_COMP
+        if (conv_params->do_average)
+          dst[y * dst_stride + x] += res;
+        else
+          dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+      }
+#endif  // CONFIG_JNT_COMP
     }
   }
 }
@@ -342,6 +381,12 @@
                          (1 << (offset_bits - conv_params->round_1 - 1)));
   const __m128i sub = _mm_set1_epi32(sub32);
 
+#if CONFIG_JNT_COMP
+  const __m128i fwd_offset = _mm_set1_epi32(conv_params->fwd_offset);
+  const __m128i bck_offset = _mm_set1_epi32(conv_params->bck_offset);
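+  // 0.5 rounding offset for the >> (DIST_PRECISION_BITS - 1) shift applied
+  // in the jnt_comp do_average path below.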
+  const __m128i jnt_round = _mm_set1_epi32(1 << (DIST_PRECISION_BITS - 2));
+#endif  // CONFIG_JNT_COMP
+
   int y_qn = subpel_y_qn;
   for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
     const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
@@ -384,10 +429,29 @@
       const __m128i subbed = _mm_sub_epi32(shifted, sub);
 
       int32_t *dst_x = dst + y * dst_stride + x;
+#if CONFIG_JNT_COMP
+      __m128i result;
+      if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+        if (conv_params->do_average) {
+          result = _mm_srai_epi32(
+              _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128((__m128i *)dst_x),
+                                          _mm_mullo_epi32(subbed, bck_offset)),
+                            jnt_round),
+              DIST_PRECISION_BITS - 1);
+        } else {
+          result = _mm_mullo_epi32(subbed, fwd_offset);
+        }
+      } else {
+        result = (conv_params->do_average)
+                     ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
+                     : subbed;
+      }
+#else
       const __m128i result =
           (conv_params->do_average)
               ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
               : subbed;
+#endif  // CONFIG_JNT_COMP
 
       _mm_storeu_si128((__m128i *)dst_x, result);
     }
@@ -396,10 +460,24 @@
       CONV_BUF_TYPE sum = 1 << offset_bits;
       for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
-      if (conv_params->do_average)
-        dst[y * dst_stride + x] += res;
-      else
-        dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+      if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+        if (conv_params->do_average) {
+          dst[y * dst_stride + x] += res * conv_params->bck_offset;
+
+          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+        } else {
+          dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+        }
+      } else {
+#endif  // CONFIG_JNT_COMP
+        if (conv_params->do_average)
+          dst[y * dst_stride + x] += res;
+        else
+          dst[y * dst_stride + x] = res;
+#if CONFIG_JNT_COMP
+      }
+#endif  // CONFIG_JNT_COMP
     }
   }
 }
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c
new file mode 100644
index 0000000..a3f4649
--- /dev/null
+++ b/av1/common/x86/convolve_2d_sse4.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+#if CONFIG_JNT_COMP
+#if CONFIG_COMPOUND_ROUND
+void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
+                                CONV_BUF_TYPE *dst, int dst_stride, int w,
+                                int h, InterpFilterParams *filter_params_x,
+                                InterpFilterParams *filter_params_y,
+                                const int subpel_x_q4, const int subpel_y_q4,
+                                ConvolveParams *conv_params) {
+  DECLARE_ALIGNED(16, uint8_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = MAX_SB_SIZE;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int do_average = conv_params->do_average;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  const __m128i zero = _mm_setzero_si128();
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
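+  // Broadcast the jnt_comp weights for the 32-bit multiplies in the
+  // vertical pass.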
+  const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
+  const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
+
+  /* Horizontal filter */
+  {
+    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const =
+        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+    for (i = 0; i < im_h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+        // Filter even-index pixels
+        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even =
+            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        res = _mm_packus_epi16(res, res);
+        _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const =
+        _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const uint8_t *data = &im_block[i * im_stride + j];
+        const __m128i src_01 = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
+        const __m128i src_23 = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
+        const __m128i src_45 = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
+        const __m128i src_67 = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
+
+        const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
+        const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
+        const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
+        const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
+        const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
+        const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
+        const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        const __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+        if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+          // NOTE(chengchen):
+          // only this part differs from av1_convolve_2d_sse2; the original
+          // C functions are av1_convolve_2d_c() and av1_jnt_convolve_2d_c()
+          // in av1/common/convolve.c
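+          // The first pass (do_average == 0) stores res * fwd_offset; the
+          // second adds res * bck_offset and rescales the weighted sum by
+          // >> (DIST_PRECISION_BITS - 1).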
+          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+          if (do_average) {
+            _mm_storeu_si128(
+                p + 0, _mm_srai_epi32(
+                           _mm_add_epi32(_mm_loadu_si128(p + 0),
+                                         _mm_mullo_epi32(res_lo_round, wt1)),
+                           DIST_PRECISION_BITS - 1));
+
+            _mm_storeu_si128(
+                p + 1, _mm_srai_epi32(
+                           _mm_add_epi32(_mm_loadu_si128(p + 1),
+                                         _mm_mullo_epi32(res_hi_round, wt1)),
+                           DIST_PRECISION_BITS - 1));
+          } else {
+            _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));
+            _mm_storeu_si128(p + 1, _mm_mullo_epi32(res_hi_round, wt0));
+          }
+        } else {
+          // Accumulate values into the destination buffer
+          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+          if (do_average) {
+            _mm_storeu_si128(
+                p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
+            _mm_storeu_si128(
+                p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+          } else {
+            _mm_storeu_si128(p + 0, res_lo_round);
+            _mm_storeu_si128(p + 1, res_hi_round);
+          }
+        }
+      }
+    }
+  }
+}
+#else   // CONFIG_COMPOUND_ROUND
+void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
+                                CONV_BUF_TYPE *dst, int dst_stride, int w,
+                                int h, InterpFilterParams *filter_params_x,
+                                InterpFilterParams *filter_params_y,
+                                const int subpel_x_q4, const int subpel_y_q4,
+                                ConvolveParams *conv_params) {
+  const int bd = 8;
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = MAX_SB_SIZE;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int do_average = conv_params->do_average;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  const __m128i zero = _mm_setzero_si128();
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
+  const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
+
+  /* Horizontal filter */
+  {
+    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+    for (i = 0; i < im_h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+        // Filter even-index pixels
+        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even =
+            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+        *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_1) >> 1) -
+        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const int16_t *data = &im_block[i * im_stride + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        const __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+        if (conv_params->fwd_offset != -1 && conv_params->bck_offset != -1) {
+          // FIXME(chengchen): validate this implementation
+          // original C function: av1_convolve_2d_c() in av1/common/convolve.c
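+          // The first pass (do_average == 0) stores res * fwd_offset; the
+          // second adds res * bck_offset and rescales by
+          // >> (DIST_PRECISION_BITS - 1).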
+          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+          if (do_average) {
+            _mm_storeu_si128(
+                p + 0, _mm_srai_epi32(
+                           _mm_add_epi32(_mm_loadu_si128(p + 0),
+                                         _mm_mullo_epi32(res_lo_round, wt1)),
+                           DIST_PRECISION_BITS - 1));
+
+            _mm_storeu_si128(
+                p + 1, _mm_srai_epi32(
+                           _mm_add_epi32(_mm_loadu_si128(p + 1),
+                                         _mm_mullo_epi32(res_hi_round, wt1)),
+                           DIST_PRECISION_BITS - 1));
+          } else {
+            _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));
+            _mm_storeu_si128(p + 1, _mm_mullo_epi32(res_hi_round, wt0));
+          }
+        } else {
+          // Accumulate values into the destination buffer
+          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+          if (do_average) {
+            _mm_storeu_si128(
+                p + 0, _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
+            _mm_storeu_si128(
+                p + 1, _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+          } else {
+            _mm_storeu_si128(p + 0, res_lo_round);
+            _mm_storeu_si128(p + 1, res_hi_round);
+          }
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_COMPOUND_ROUND
+#endif  // CONFIG_JNT_COMP