SSE2 implementation of frame error module

Module-level performance improves by a factor of ~1.13x relative to the C code.

Change-Id: I716aafb132482a2fc2e15ea70fa94f7982cc440a
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 90952ae..cdab7f5 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -226,7 +226,8 @@
             "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
             "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
             "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
-            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
+            "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
+            "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
             "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index d1a38fa..0a38691 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -337,7 +337,7 @@
 specialize qw/av1_highbd_warp_affine sse4_1/;
 
 add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
-specialize qw/av1_calc_frame_error avx2/;
+specialize qw/av1_calc_frame_error sse2 avx2/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 0000000..6ff6665
--- /dev/null
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+// SSE2 implementation of av1_calc_frame_error().
+//
+// Sums, over a p_width x p_height region, the error_measure_lut entry
+// selected by (dst - ref + 255) for every pixel and returns the total.
+//
+//   ref, ref_stride: first buffer and its row pitch in bytes.
+//   dst, dst_stride: second buffer and its row pitch in bytes.
+//   p_width, p_height: dimensions of the compared region in pixels.
+//
+// Each row is handled in groups of 16 pixels with SSE2; the trailing
+// (p_width % 16) pixels of a row are passed to the scalar error_measure().
+int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
+                                  const uint8_t *const dst, int p_width,
+                                  int p_height, int dst_stride) {
+  int64_t sum_error = 0;
+  int i, j;
+  const __m128i zero = _mm_setzero_si128();
+  // Bias that moves dst - ref from [-255, 255] into the non-negative index
+  // range [0, 510] used to address error_measure_lut.
+  const __m128i dup_255 = _mm_set1_epi16(255);
+  // Two 64-bit running totals covering the vectorized part of all rows.
+  __m128i col_error = zero;
+  for (i = 0; i < p_height; i++) {
+    const uint8_t *const ref_row = ref + i * ref_stride;
+    const uint8_t *const dst_row = dst + i * dst_stride;
+    // Four 32-bit partial sums for the current row.
+    __m128i row_error = zero;
+    for (j = 0; j < (p_width / 16); j++) {
+      // Unaligned loads: neither the base pointers nor the strides are
+      // guaranteed to be multiples of 16 bytes.
+      const __m128i ref_8 =
+          _mm_loadu_si128((const __m128i *)(ref_row + j * 16));
+      const __m128i dst_8 =
+          _mm_loadu_si128((const __m128i *)(dst_row + j * 16));
+      // Zero-extend the 16 bytes of each input to 16-bit lanes.
+      const __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
+      const __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
+      const __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
+      const __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
+
+      const __m128i diff_1 =
+          _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
+      const __m128i diff_2 =
+          _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
+
+      // Look up each 16-bit index individually and repack the results four
+      // at a time into 32-bit lanes (_mm_extract_epi16 needs a constant
+      // lane number, hence the unrolled form).
+      const __m128i error_1_lo =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 2)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 1)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
+      const __m128i error_1_hi =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 6)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 5)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
+      const __m128i error_2_lo =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 2)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 1)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
+      const __m128i error_2_hi =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 6)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 5)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
+
+      const __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
+      const __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
+      const __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
+
+      row_error = _mm_add_epi32(row_error, error_1_2);
+    }
+    // Zero-extend the four 32-bit row sums to 64 bits and add them to the
+    // running totals.
+    const __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
+    const __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
+    const __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
+    col_error = _mm_add_epi64(col_error, col_error_temp);
+    // Scalar tail: columns left over when p_width is not a multiple of 16.
+    for (int l = j * 16; l < p_width; ++l) {
+      sum_error += (int64_t)error_measure(dst_row[l] - ref_row[l]);
+    }
+  }
+  // Fold the two 64-bit lanes of col_error into the scalar total.
+  int64_t sum_error_d_0, sum_error_d_1;
+  xx_storel_64(&sum_error_d_0, col_error);
+  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
+  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+  return sum_error;
+}
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
index edfacf2..73b880a 100644
--- a/test/frame_error_test.cc
+++ b/test/frame_error_test.cc
@@ -27,7 +27,7 @@
 typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
                                     const uint8_t *const dst, int p_width,
                                     int p_height, int p_stride);
-#if HAVE_AVX2
+#if HAVE_AVX2 || HAVE_SSE2
 const int kBlockWidth[] = {
   832, 834, 640, 1280, 1920,
 };
@@ -145,6 +145,14 @@
   RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
 }
 
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
     AVX2, AV1FrameErrorTest,