Optimize av1_jnt_convolve_2d_copy function

By applying the left shift after the weighted multiplication instead of
before it, the multiply operands stay within 16 bits, so the 32-bit
multiply (_mm_mullo_epi32, SSE4.1) can be replaced with _mm_mullo_epi16.
Thus we can implement it with sse2 instead of sse4.

Change-Id: I63e8ba414383a24f820bad4a6c607f222ec40ec2
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index abbcbf6..ea4fd32 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -602,7 +602,7 @@
 
     if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
       add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-      specialize qw/av1_jnt_convolve_2d_copy sse4_1/;
+      specialize qw/av1_jnt_convolve_2d_copy sse2/;
     }
   }
 
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 56c6d14..17df00f2 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -684,7 +684,7 @@
 
   for (int y = 0; y < h; ++y) {
     for (int x = 0; x < w; ++x) {
-      CONV_BUF_TYPE res = (1 << bits) * src[y * src_stride + x];
+      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
       if (conv_params->do_average)
         dst[y * dst_stride + x] += res;
       else
@@ -776,12 +776,14 @@
       CONV_BUF_TYPE res = (1 << bits) * src[y * src_stride + x];
       if (conv_params->use_jnt_comp_avg) {
         if (conv_params->do_average) {
-          dst[y * dst_stride + x] += res * conv_params->bck_offset;
+          dst[y * dst_stride + x] +=
+              (src[y * src_stride + x] * conv_params->bck_offset) << bits;
 
           dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
                                                        DIST_PRECISION_BITS - 1);
         } else {
-          dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+          dst[y * dst_stride + x] =
+              (src[y * src_stride + x] * conv_params->fwd_offset) << bits;
         }
       } else {
         if (conv_params->do_average)
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index 19f01be..13275b6 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -385,16 +385,17 @@
                                InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
   const int bits =
       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
   const int do_average = conv_params->do_average;
   const __m128i zero = _mm_setzero_si128();
   const __m128i left_shift = _mm_cvtsi32_si128(bits);
   int i, j;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
 
   if (!(w % 16)) {
     for (i = 0; i < h; ++i) {
@@ -489,4 +490,212 @@
     }
   }
 }
-#endif
+
+#if CONFIG_JNT_COMP
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+                                   CONV_BUF_TYPE *dst, int dst_stride, int w,
+                                   int h, InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int do_average = conv_params->do_average;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  int i, j;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
+  const __m128i jnt_r = _mm_set1_epi32(jnt_round_const);
+
+  if (!(w % 16)) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 16) {
+        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+        const __m128i d16_1 = _mm_unpackhi_epi8(d8, zero);
+        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
+        __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
+        __m128i d32_2 = _mm_unpacklo_epi16(d16_1, zero);
+        __m128i d32_3 = _mm_unpackhi_epi16(d16_1, zero);
+
+        __m128i *const p = (__m128i *)&dst[j];
+
+        if (conv_params->use_jnt_comp_avg) {
+          if (do_average) {
+            __m128i mul = _mm_mullo_epi16(d32_0, wt1);
+            __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
+            __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res);
+            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+
+            mul = _mm_mullo_epi16(d32_1, wt1);
+            weighted_res = _mm_sll_epi32(mul, left_shift);
+            sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res);
+            d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+
+            mul = _mm_mullo_epi16(d32_2, wt1);
+            weighted_res = _mm_sll_epi32(mul, left_shift);
+            sum = _mm_add_epi32(_mm_loadu_si128(p + 2), weighted_res);
+            d32_2 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+
+            mul = _mm_mullo_epi16(d32_3, wt1);
+            weighted_res = _mm_sll_epi32(mul, left_shift);
+            sum = _mm_add_epi32(_mm_loadu_si128(p + 3), weighted_res);
+            d32_3 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+          } else {
+            d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift);
+            d32_1 = _mm_sll_epi32(_mm_mullo_epi16(d32_1, wt0), left_shift);
+            d32_2 = _mm_sll_epi32(_mm_mullo_epi16(d32_2, wt0), left_shift);
+            d32_3 = _mm_sll_epi32(_mm_mullo_epi16(d32_3, wt0), left_shift);
+          }
+        } else {
+          if (do_average) {
+            d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0),
+                                  _mm_sll_epi32(d32_0, left_shift));
+            d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1),
+                                  _mm_sll_epi32(d32_1, left_shift));
+            d32_2 = _mm_add_epi32(_mm_loadu_si128(p + 2),
+                                  _mm_sll_epi32(d32_2, left_shift));
+            d32_3 = _mm_add_epi32(_mm_loadu_si128(p + 3),
+                                  _mm_sll_epi32(d32_3, left_shift));
+          } else {
+            d32_0 = _mm_sll_epi32(d32_0, left_shift);
+            d32_1 = _mm_sll_epi32(d32_1, left_shift);
+            d32_2 = _mm_sll_epi32(d32_2, left_shift);
+            d32_3 = _mm_sll_epi32(d32_3, left_shift);
+          }
+        }
+
+        _mm_storeu_si128(p + 0, d32_0);
+        _mm_storeu_si128(p + 1, d32_1);
+        _mm_storeu_si128(p + 2, d32_2);
+        _mm_storeu_si128(p + 3, d32_3);
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w % 8)) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
+        __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
+
+        __m128i *const p = (__m128i *)&dst[j];
+        if (conv_params->use_jnt_comp_avg) {
+          if (do_average) {
+            __m128i mul = _mm_mullo_epi16(d32_0, wt1);
+            __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
+            __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res);
+            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+
+            mul = _mm_mullo_epi16(d32_1, wt1);
+            weighted_res = _mm_sll_epi32(mul, left_shift);
+            sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res);
+            d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+          } else {
+            d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift);
+            d32_1 = _mm_sll_epi32(_mm_mullo_epi16(d32_1, wt0), left_shift);
+          }
+        } else {
+          if (do_average) {
+            d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0),
+                                  _mm_sll_epi32(d32_0, left_shift));
+            d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1),
+                                  _mm_sll_epi32(d32_1, left_shift));
+          } else {
+            d32_0 = _mm_sll_epi32(d32_0, left_shift);
+            d32_1 = _mm_sll_epi32(d32_1, left_shift);
+          }
+        }
+
+        _mm_storeu_si128(p + 0, d32_0);
+        _mm_storeu_si128(p + 1, d32_1);
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (!(w % 4)) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 4) {
+        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
+
+        __m128i *const p = (__m128i *)&dst[j];
+        if (conv_params->use_jnt_comp_avg) {
+          if (do_average) {
+            __m128i mul = _mm_mullo_epi16(d32_0, wt1);
+            __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
+            __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res);
+            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+          } else {
+            d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift);
+          }
+        } else {
+          if (do_average) {
+            d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0),
+                                  _mm_sll_epi32(d32_0, left_shift));
+          } else {
+            d32_0 = _mm_sll_epi32(d32_0, left_shift);
+          }
+        }
+
+        _mm_storeu_si128(p, d32_0);
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 2) {
+        const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]);
+        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
+
+        __m128i *const p = (__m128i *)&dst[j];
+        if (conv_params->use_jnt_comp_avg) {
+          if (do_average) {
+            __m128i mul = _mm_mullo_epi16(d32_0, wt1);
+            __m128i weighted_res = _mm_sll_epi32(mul, left_shift);
+            __m128i sum = _mm_add_epi32(_mm_loadl_epi64(p), weighted_res);
+            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
+                                   DIST_PRECISION_BITS - 1);
+          } else {
+            d32_0 = _mm_sll_epi32(_mm_mullo_epi16(d32_0, wt0), left_shift);
+          }
+        } else {
+          if (do_average) {
+            d32_0 = _mm_add_epi32(_mm_loadl_epi64(p),
+                                  _mm_sll_epi32(d32_0, left_shift));
+          } else {
+            d32_0 = _mm_sll_epi32(d32_0, left_shift);
+          }
+        }
+
+        _mm_storel_epi64(p, d32_0);
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+#endif  // CONFIG_JNT_COMP
+#endif  // CONFIG_COMPOUND_ROUND
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c
index 893036b..71c32e7 100644
--- a/av1/common/x86/convolve_2d_sse4.c
+++ b/av1/common/x86/convolve_2d_sse4.c
@@ -450,194 +450,5 @@
     }
   }
 }
-
-void av1_jnt_convolve_2d_copy_sse4_1(const uint8_t *src, int src_stride,
-                                     CONV_BUF_TYPE *dst, int dst_stride, int w,
-                                     int h, InterpFilterParams *filter_params_x,
-                                     InterpFilterParams *filter_params_y,
-                                     const int subpel_x_q4,
-                                     const int subpel_y_q4,
-                                     ConvolveParams *conv_params) {
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-  const int do_average = conv_params->do_average;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i left_shift = _mm_cvtsi32_si128(bits);
-  int i, j;
-  (void)filter_params_x;
-  (void)filter_params_y;
-  (void)subpel_x_q4;
-  (void)subpel_y_q4;
-
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi32(w0);
-  const __m128i wt1 = _mm_set1_epi32(w1);
-  const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
-  const __m128i jnt_r = _mm_set1_epi32(jnt_round_const);
-
-  if (!(w % 16)) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
-        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
-        const __m128i d16_1 = _mm_unpackhi_epi8(d8, zero);
-        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-        __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
-        __m128i d32_2 = _mm_unpacklo_epi16(d16_1, zero);
-        __m128i d32_3 = _mm_unpackhi_epi16(d16_1, zero);
-
-        d32_0 = _mm_sll_epi32(d32_0, left_shift);
-        d32_1 = _mm_sll_epi32(d32_1, left_shift);
-        d32_2 = _mm_sll_epi32(d32_2, left_shift);
-        d32_3 = _mm_sll_epi32(d32_3, left_shift);
-
-        __m128i *const p = (__m128i *)&dst[j];
-
-        if (conv_params->use_jnt_comp_avg) {
-          if (do_average) {
-            __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1);
-            __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res);
-            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-
-            weighted_res = _mm_mullo_epi32(d32_1, wt1);
-            sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res);
-            d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-
-            weighted_res = _mm_mullo_epi32(d32_2, wt1);
-            sum = _mm_add_epi32(_mm_loadu_si128(p + 2), weighted_res);
-            d32_2 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-
-            weighted_res = _mm_mullo_epi32(d32_3, wt1);
-            sum = _mm_add_epi32(_mm_loadu_si128(p + 3), weighted_res);
-            d32_3 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-          } else {
-            d32_0 = _mm_mullo_epi32(d32_0, wt0);
-            d32_1 = _mm_mullo_epi32(d32_1, wt0);
-            d32_2 = _mm_mullo_epi32(d32_2, wt0);
-            d32_3 = _mm_mullo_epi32(d32_3, wt0);
-          }
-        } else {
-          if (do_average) {
-            d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), d32_0);
-            d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1), d32_1);
-            d32_2 = _mm_add_epi32(_mm_loadu_si128(p + 2), d32_2);
-            d32_3 = _mm_add_epi32(_mm_loadu_si128(p + 3), d32_3);
-          }
-        }
-
-        _mm_storeu_si128(p + 0, d32_0);
-        _mm_storeu_si128(p + 1, d32_1);
-        _mm_storeu_si128(p + 2, d32_2);
-        _mm_storeu_si128(p + 3, d32_3);
-      }
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else if (!(w % 8)) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
-        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
-        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-        __m128i d32_1 = _mm_unpackhi_epi16(d16_0, zero);
-
-        d32_0 = _mm_sll_epi32(d32_0, left_shift);
-        d32_1 = _mm_sll_epi32(d32_1, left_shift);
-
-        __m128i *const p = (__m128i *)&dst[j];
-        if (conv_params->use_jnt_comp_avg) {
-          if (do_average) {
-            __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1);
-            __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res);
-            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-
-            weighted_res = _mm_mullo_epi32(d32_1, wt1);
-            sum = _mm_add_epi32(_mm_loadu_si128(p + 1), weighted_res);
-            d32_1 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-          } else {
-            d32_0 = _mm_mullo_epi32(d32_0, wt0);
-            d32_1 = _mm_mullo_epi32(d32_1, wt0);
-          }
-        } else {
-          if (do_average) {
-            d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), d32_0);
-            d32_1 = _mm_add_epi32(_mm_loadu_si128(p + 1), d32_1);
-          }
-        }
-
-        _mm_storeu_si128(p + 0, d32_0);
-        _mm_storeu_si128(p + 1, d32_1);
-      }
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else if (!(w % 4)) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 4) {
-        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
-        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
-        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-
-        d32_0 = _mm_sll_epi32(d32_0, left_shift);
-
-        __m128i *const p = (__m128i *)&dst[j];
-        if (conv_params->use_jnt_comp_avg) {
-          if (do_average) {
-            __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1);
-            __m128i sum = _mm_add_epi32(_mm_loadu_si128(p + 0), weighted_res);
-            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-          } else {
-            d32_0 = _mm_mullo_epi32(d32_0, wt0);
-          }
-        } else {
-          if (do_average) {
-            d32_0 = _mm_add_epi32(_mm_loadu_si128(p + 0), d32_0);
-          }
-        }
-
-        _mm_storeu_si128(p, d32_0);
-      }
-      src += src_stride;
-      dst += dst_stride;
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 2) {
-        const __m128i d8 = _mm_cvtsi32_si128(*(const int *)&src[j]);
-        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
-        __m128i d32_0 = _mm_unpacklo_epi16(d16_0, zero);
-
-        d32_0 = _mm_sll_epi32(d32_0, left_shift);
-        __m128i *const p = (__m128i *)&dst[j];
-        if (conv_params->use_jnt_comp_avg) {
-          if (do_average) {
-            __m128i weighted_res = _mm_mullo_epi32(d32_0, wt1);
-            __m128i sum = _mm_add_epi32(_mm_loadl_epi64(p), weighted_res);
-            d32_0 = _mm_srai_epi32(_mm_add_epi32(sum, jnt_r),
-                                   DIST_PRECISION_BITS - 1);
-          } else {
-            d32_0 = _mm_mullo_epi32(d32_0, wt0);
-          }
-        } else {
-          if (do_average) {
-            d32_0 = _mm_add_epi32(_mm_loadl_epi64(p), d32_0);
-          }
-        }
-
-        _mm_storel_epi64(p, d32_0);
-      }
-      src += src_stride;
-      dst += dst_stride;
-    }
-  }
-}
 #endif  // CONFIG_COMPOUND_ROUND
 #endif  // CONFIG_JNT_COMP