Optimize frame error module

Added AVX2 variant of the frame error computation.

For speed 1 and 2 presets, encode time reductions of 0.45% and 0.55%
respectively were observed (averaged across multiple test cases).

Module level gains improved by a factor of ~2.5x w.r.t. the C code.
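
For reference, a minimal sketch of the scalar metric being vectorized,
mirroring av1_calc_frame_error_c and error_measure() below: each pixel
difference is biased by 255 and used to index the 512-entry
error_measure_lut.

  // Scalar sketch; the AVX2 kernel computes the same sum 16 pixels per
  // row, 4 rows at a time, using 32-bit gathers into error_measure_lut.
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      // dst - ref lies in [-255, 255], so index 255 + err is in [0, 510].
      sum_error += (int64_t)error_measure_lut[255 + dst[j + i * p_stride] -
                                              ref[j + i * stride]];
    }
  }

The LUT and error_measure() move from warped_motion.c to warped_motion.h
so the SIMD file can share them.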

Change-Id: I0e356134fd3fb486e7b121f6c63e29efdc870ff9
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ea56525..e69d275 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -336,6 +336,9 @@
 add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_highbd_warp_affine sse4_1/;
 
+add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
+specialize qw/av1_calc_frame_error avx2/;
+
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
   specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index a684e8e..aea9c32 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -22,76 +22,6 @@
 
 #define WARP_ERROR_BLOCK 32
 
-/* clang-format off */
-static const int error_measure_lut[512] = {
-  // pow 0.7
-  16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
-  16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
-  15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
-  15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
-  14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
-  14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
-  14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
-  13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
-  13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
-  12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
-  12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
-  12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
-  11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
-  11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
-  10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
-  10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
-  10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
-  9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
-  9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
-  8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
-  8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
-  7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
-  7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
-  6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
-  6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
-  5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
-  5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
-  4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
-  3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
-  3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
-  2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
-  1323, 1187, 1045,  894,  731,  550,  339,    0,
-  339,  550,  731,  894, 1045, 1187, 1323, 1452,
-  1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
-  2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
-  3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
-  3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
-  4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
-  5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
-  5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
-  6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
-  6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
-  7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
-  7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
-  8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
-  8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
-  9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
-  9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
-  10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
-  10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
-  11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
-  11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
-  11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
-  12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
-  12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
-  13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
-  13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
-  13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
-  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
-  14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
-  15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
-  15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
-  15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
-  16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
-};
-/* clang-format on */
-
 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
 // at a time. The zoom/rotation/shear in the model are applied to the
 // "fractional" position of each pixel, which therefore varies within
@@ -539,9 +469,11 @@
                          delta);
 }
 
-static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
-                                  const uint16_t *const dst, int p_width,
-                                  int p_height, int p_stride, int bd) {
+static int64_t av1_calc_highbd_frame_error(const uint16_t *const ref,
+                                           int stride,
+                                           const uint16_t *const dst,
+                                           int p_width, int p_height,
+                                           int p_stride, int bd) {
   int64_t sum_error = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
@@ -574,7 +506,7 @@
                         CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
                         WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
                         &conv_params);
-      gm_sumerr += highbd_frame_error(
+      gm_sumerr += av1_calc_highbd_frame_error(
           tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
           warp_w, warp_h, p_stride, bd);
       if (gm_sumerr > best_error) return gm_sumerr;
@@ -583,10 +515,6 @@
   return gm_sumerr;
 }
 
-static INLINE int error_measure(int err) {
-  return error_measure_lut[255 + err];
-}
-
 /* The warp filter for ROTZOOM and AFFINE models works as follows:
    * Split the input into 8x8 blocks
    * For each block, project the point (4, 4) within the block, to get the
@@ -820,9 +748,9 @@
                   alpha, beta, gamma, delta);
 }
 
-static int64_t frame_error(const uint8_t *const ref, int stride,
-                           const uint8_t *const dst, int p_width, int p_height,
-                           int p_stride) {
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
+                               const uint8_t *const dst, int p_width,
+                               int p_height, int p_stride) {
   int64_t sum_error = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
@@ -856,7 +784,8 @@
       warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
                  WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
 
-      gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
+      gm_sumerr +=
+          av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
                                warp_w, warp_h, p_stride);
       if (gm_sumerr > best_error) return gm_sumerr;
     }
@@ -867,11 +796,11 @@
 int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                         uint8_t *dst, int p_width, int p_height, int p_stride) {
   if (use_hbd) {
-    return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
-                              CONVERT_TO_SHORTPTR(dst), p_width, p_height,
-                              p_stride, bd);
+    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+                                       CONVERT_TO_SHORTPTR(dst), p_width,
+                                       p_height, p_stride, bd);
   }
-  return frame_error(ref, stride, dst, p_width, p_height, p_stride);
+  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
 }
 
 int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 8097693..d05d96d 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -37,6 +37,76 @@
 DECLARE_ALIGNED(8, extern const int8_t,
                 av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
 
+/* clang-format off */
+static const int error_measure_lut[512] = {
+    // pow 0.7
+    16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+    16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+    15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+    15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+    14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+    14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+    14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+    13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+    13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+    12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+    12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+    12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+    11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+    11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+    10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+    10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+    10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
+    9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+    9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+    8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+    8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+    7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+    7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+    6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+    6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+    5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+    5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+    4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+    3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+    3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+    2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+    1323, 1187, 1045,  894,  731,  550,  339,    0,
+    339,  550,  731,  894, 1045, 1187, 1323, 1452,
+    1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+    2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+    3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+    3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+    4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+    5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+    5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+    6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+    6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+    7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+    7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+    8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+    8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+    9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+    9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
+    10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+    10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+    11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+    11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+    11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+    12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+    12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+    13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+    13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+    13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+    14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+    14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+    15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+    15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+    15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+    16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
+
 static const uint8_t warp_pad_left[14][16] = {
   { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -71,6 +141,10 @@
   { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
 };
 
+static INLINE int error_measure(int err) {
+  return error_measure_lut[255 + err];
+}
+
 // Returns the error between the result of applying motion 'wm' to the frame
 // described by 'ref' and the frame described by 'dst'.
 int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index 17ed2d4..75df1b0 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -12,6 +12,7 @@
 #include <immintrin.h>
 #include "config/av1_rtcd.h"
 #include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
 
 DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
   0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
@@ -1001,6 +1002,116 @@
                                 shuffle_src);
 }
 
+int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
+                                  const uint8_t *const dst, int p_width,
+                                  int p_height, int dst_stride) {
+  int64_t sum_error = 0;
+  int i, j;
+  __m256i row_error, col_error;
+  __m256i zero = _mm256_set1_epi16(0);
+  __m256i dup_255 = _mm256_set1_epi16(255);
+  col_error = zero;
+
+  for (i = 0; i < (p_height / 4); i++) {
+    row_error = _mm256_set1_epi16(0);
+    for (j = 0; j < (p_width / 16); j++) {
+      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
+      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
+      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
+      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
+      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
+      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
+      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
+      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
+          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
+
+      __m256i diff_1 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
+      __m256i diff_2 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
+      __m256i diff_3 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
+      __m256i diff_4 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
+
+      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
+      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
+      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
+      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
+      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
+      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
+      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
+      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
+
+      __m256i error_1_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
+      __m256i error_1_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
+      __m256i error_2_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
+      __m256i error_2_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
+      __m256i error_3_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
+      __m256i error_3_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
+      __m256i error_4_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
+      __m256i error_4_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
+
+      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
+      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
+      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
+      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
+
+      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
+      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
+
+      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
+      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
+    }
+    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
+    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
+    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
+    col_error = _mm256_add_epi64(col_error, col_error_temp);
+    // Error summation for remaining width, which is not multiple of 16
+    if (p_width & 0xf) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = j * 16; l < p_width; ++l) {
+          sum_error +=
+              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
+                                     ref[l + ((i * 4) + k) * ref_stride]);
+        }
+      }
+    }
+  }
+  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
+  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
+  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
+  int64_t sum_error_d_0, sum_error_d_1;
+  xx_storel_64(&sum_error_d_0, sum_error_q_0);
+  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
+  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+  // Error summation for remaining height, which is not multiple of 4
+  if (p_height & 0x3) {
+    for (int k = i * 4; k < p_height; ++k) {
+      for (int l = 0; l < p_width; ++l) {
+        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
+                                            ref[l + k * ref_stride]);
+      }
+    }
+  }
+  return sum_error;
+}
+
 void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
                           int height, int stride, uint8_t *pred, int p_col,
                           int p_row, int p_width, int p_height, int p_stride,
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
new file mode 100644
index 0000000..69e0a87
--- /dev/null
+++ b/test/frame_error_test.cc
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1FrameError {
+
+typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
+                                    const uint8_t *const dst, int p_width,
+                                    int p_height, int p_stride);
+
+const int kBlockWidth[] = {
+  832, 834, 640, 1280, 1920,
+};
+
+const int kBlockHeight[] = {
+  480, 482, 360, 720, 1080,
+};
+typedef ::testing::tuple<frame_error_func, int, int> FrameErrorParam;
+
+class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
+ public:
+  ~AV1FrameErrorTest();
+  void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RandomValues(frame_error_func test_impl, int width, int height);
+  void ExtremeValues(frame_error_func test_impl, int width, int height);
+  void RunSpeedTest(frame_error_func test_impl, int width, int height);
+  libaom_test::ACMRandom rnd_;
+};
+AV1FrameErrorTest::~AV1FrameErrorTest() { ; }
+
+void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
+                                     int height) {
+  int w = width;
+  int h = height;
+  const int stride = (((w * 3) / 2) + 15) & (~15);
+  const int max_blk_size = stride * h;
+  uint8_t *dst = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  uint8_t *ref = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  for (int i = 0; i < (max_blk_size); ++i) {
+    dst[i] = rnd_.Rand8();
+    ref[i] = rnd_.Rand8();
+  }
+  int64_t ref_error = av1_calc_frame_error_c(ref, stride, dst, w, h, stride);
+  int64_t test_error = test_impl(ref, stride, dst, w, h, stride);
+  ASSERT_EQ(test_error, ref_error) << w << "x" << h;
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
+                                      int height) {
+  int w = width;
+  int h = height;
+  const int stride = (((w * 3) / 2) + 15) & (~15);
+  const int max_blk_size = stride * h;
+  uint8_t *dst = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  uint8_t *ref = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  for (int r = 0; r < 2; r++) {
+    if (r == 0) {
+      memset(dst, 0, max_blk_size);
+      memset(ref, 255, max_blk_size);
+    } else if (r == 1) {
+      memset(dst, 255, max_blk_size);
+      memset(ref, 0, max_blk_size);
+    }
+    int64_t ref_error = av1_calc_frame_error_c(ref, stride, dst, w, h, stride);
+    int64_t test_error = test_impl(ref, stride, dst, w, h, stride);
+    ASSERT_EQ(test_error, ref_error) << w << "x" << h;
+  }
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
+                                     int height) {
+  const int w = width;
+  const int h = height;
+  const int stride = (((w * 3) / 2) + 15) & (~15);
+  const int max_blk_size = stride * h;
+  uint8_t *dst = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  uint8_t *ref = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  for (int i = 0; i < (max_blk_size); ++i) {
+    dst[i] = ref[i] = rnd_.Rand8();
+  }
+  const int num_loops = 10000000 / (w + h);
+  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    frame_error_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref, stride, dst, w, h, stride);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  aom_free(dst);
+  aom_free(ref);
+  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1FrameErrorTest, CheckOutput) {
+  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+};  // namespace AV1FrameError
diff --git a/test/test.cmake b/test/test.cmake
index 9e55aea..c5c10e2 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -205,6 +205,7 @@
               "${AOM_ROOT}/test/sum_squares_test.cc"
               "${AOM_ROOT}/test/variance_test.cc"
               "${AOM_ROOT}/test/wiener_test.cc"
+              "${AOM_ROOT}/test/frame_error_test.cc"
               "${AOM_ROOT}/test/warp_filter_test.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.cc"
               "${AOM_ROOT}/test/warp_filter_test_util.h")