Optimize frame error module
Added an AVX2 variant of the frame error function.
For the speed = 1 and speed = 2 presets, observed encode time reductions of
0.45% and 0.55%, respectively (averaged across multiple test cases).
The module itself is ~2.5x faster than the C code.
Change-Id: I0e356134fd3fb486e7b121f6c63e29efdc870ff9
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ea56525..e69d275 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -336,6 +336,9 @@
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_highbd_warp_affine sse4_1/;
+add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
+specialize qw/av1_calc_frame_error avx2/;
+
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index a684e8e..aea9c32 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -22,76 +22,6 @@
#define WARP_ERROR_BLOCK 32
-/* clang-format off */
-static const int error_measure_lut[512] = {
- // pow 0.7
- 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
- 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
- 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
- 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
- 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
- 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
- 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
- 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
- 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
- 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
- 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
- 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
- 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
- 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
- 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
- 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
- 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
- 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
- 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
- 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
- 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
- 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
- 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
- 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
- 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
- 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
- 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
- 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
- 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
- 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
- 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
- 1323, 1187, 1045, 894, 731, 550, 339, 0,
- 339, 550, 731, 894, 1045, 1187, 1323, 1452,
- 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
- 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
- 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
- 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
- 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
- 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
- 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
- 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
- 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
- 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
- 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
- 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
- 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
- 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
- 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
- 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
- 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
- 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
- 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
- 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
- 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
- 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
- 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
- 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
- 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
- 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
- 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
- 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
- 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
- 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
- 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
-};
-/* clang-format on */
-
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
@@ -539,9 +469,11 @@
delta);
}
-static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
- const uint16_t *const dst, int p_width,
- int p_height, int p_stride, int bd) {
+static int64_t av1_calc_highbd_frame_error(const uint16_t *const ref,
+ int stride,
+ const uint16_t *const dst,
+ int p_width, int p_height,
+ int p_stride, int bd) {
int64_t sum_error = 0;
for (int i = 0; i < p_height; ++i) {
for (int j = 0; j < p_width; ++j) {
@@ -574,7 +506,7 @@
CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
&conv_params);
- gm_sumerr += highbd_frame_error(
+ gm_sumerr += av1_calc_highbd_frame_error(
tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
warp_w, warp_h, p_stride, bd);
if (gm_sumerr > best_error) return gm_sumerr;
@@ -583,10 +515,6 @@
return gm_sumerr;
}
-static INLINE int error_measure(int err) {
- return error_measure_lut[255 + err];
-}
-
/* The warp filter for ROTZOOM and AFFINE models works as follows:
* Split the input into 8x8 blocks
* For each block, project the point (4, 4) within the block, to get the
@@ -820,9 +748,9 @@
alpha, beta, gamma, delta);
}
-static int64_t frame_error(const uint8_t *const ref, int stride,
- const uint8_t *const dst, int p_width, int p_height,
- int p_stride) {
+int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
+ const uint8_t *const dst, int p_width,
+ int p_height, int p_stride) {
int64_t sum_error = 0;
for (int i = 0; i < p_height; ++i) {
for (int j = 0; j < p_width; ++j) {
@@ -856,7 +784,8 @@
warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
- gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
+ gm_sumerr +=
+ av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
warp_w, warp_h, p_stride);
if (gm_sumerr > best_error) return gm_sumerr;
}
@@ -867,11 +796,11 @@
int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
uint8_t *dst, int p_width, int p_height, int p_stride) {
if (use_hbd) {
- return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
- CONVERT_TO_SHORTPTR(dst), p_width, p_height,
- p_stride, bd);
+ return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+ CONVERT_TO_SHORTPTR(dst), p_width,
+ p_height, p_stride, bd);
}
- return frame_error(ref, stride, dst, p_width, p_height, p_stride);
+ return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
}
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 8097693..d05d96d 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -37,6 +37,76 @@
DECLARE_ALIGNED(8, extern const int8_t,
av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
+/* clang-format off */
+static const int error_measure_lut[512] = {
+ // pow 0.7
+ 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+ 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+ 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+ 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+ 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+ 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+ 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+ 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+ 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+ 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+ 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+ 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+ 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+ 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+ 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+ 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+ 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
+ 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+ 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+ 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+ 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+ 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+ 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+ 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+ 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+ 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+ 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+ 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+ 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+ 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+ 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+ 1323, 1187, 1045, 894, 731, 550, 339, 0,
+ 339, 550, 731, 894, 1045, 1187, 1323, 1452,
+ 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+ 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+ 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+ 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+ 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+ 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+ 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+ 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+ 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+ 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+ 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+ 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+ 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+ 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+ 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
+ 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+ 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+ 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+ 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+ 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+ 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+ 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+ 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+ 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+ 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+ 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+ 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+ 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+ 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+ 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+ 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
+
static const uint8_t warp_pad_left[14][16] = {
{ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -71,6 +141,10 @@
{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
};
+static INLINE int error_measure(int err) {
+ return error_measure_lut[255 + err];
+}
+
// Returns the error between the result of applying motion 'wm' to the frame
// described by 'ref' and the frame described by 'dst'.
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index 17ed2d4..75df1b0 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -12,6 +12,7 @@
#include <immintrin.h>
#include "config/av1_rtcd.h"
#include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
@@ -1001,6 +1002,116 @@
shuffle_src);
}
+// Sums error_measure(dst - ref) over a p_width x p_height block of 8-bit
+// pixels. ref and dst are read with unaligned loads (no alignment needed).
+int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
+                                  const uint8_t *const dst, int p_width,
+                                  int p_height, int dst_stride) {
+  int64_t sum_error = 0;
+  int i, j;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i dup_255 = _mm256_set1_epi16(255);  // bias added to dst - ref
+  __m256i row_error, col_error = zero;
+  for (i = 0; i < (p_height / 4); i++) {
+    row_error = zero;
+    for (j = 0; j < (p_width / 16); j++) {
+      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
+      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
+      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
+      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
+      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
+      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
+      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
+      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+          (const __m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
+      // Bias each 16-bit difference by 255, matching error_measure()'s index.
+      __m256i diff_1 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
+      __m256i diff_2 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
+      __m256i diff_3 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
+      __m256i diff_4 =
+          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
+      // Interleave with zero to widen the 16-bit indices to 32 bits.
+      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
+      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
+      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
+      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
+      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
+      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
+      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
+      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
+      // Gather error_measure_lut[index]; the scale of 4 is the entry size.
+      __m256i error_1_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
+      __m256i error_1_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
+      __m256i error_2_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
+      __m256i error_2_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
+      __m256i error_3_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
+      __m256i error_3_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
+      __m256i error_4_lo =
+          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
+      __m256i error_4_hi =
+          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
+      // Reduce the eight gathered vectors to one vector of 32-bit sums.
+      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
+      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
+      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
+      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
+      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
+      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
+      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
+      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
+    }
+    // Widen the 32-bit sums to 64 bits before adding to the running total.
+    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
+    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
+    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
+    col_error = _mm256_add_epi64(col_error, col_error_temp);
+    // Error summation for remaining width, which is not multiple of 16
+    if (p_width & 0xf) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = j * 16; l < p_width; ++l) {
+          sum_error +=
+              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
+                                     ref[l + ((i * 4) + k) * ref_stride]);
+        }
+      }
+    }
+  }
+  // Add the four 64-bit lanes of col_error into the scalar total.
+  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
+  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
+  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
+  int64_t sum_error_d_0, sum_error_d_1;
+  xx_storel_64(&sum_error_d_0, sum_error_q_0);
+  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
+  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
+  // Error summation for remaining height, which is not multiple of 4
+  if (p_height & 0x3) {
+    for (int k = i * 4; k < p_height; ++k) {
+      for (int l = 0; l < p_width; ++l) {
+        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
+                                            ref[l + k * ref_stride]);
+      }
+    }
+  }
+  return sum_error;
+}
+
void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
new file mode 100644
index 0000000..69e0a87
--- /dev/null
+++ b/test/frame_error_test.cc
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1FrameError {
+
+typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
+ const uint8_t *const dst, int p_width,
+ int p_height, int p_stride);
+
+const int kBlockWidth[] = {
+ 832, 834, 640, 1280, 1920,
+};
+
+const int kBlockHeight[] = {
+ 480, 482, 360, 720, 1080,
+};
+typedef ::testing::tuple<frame_error_func, int, int> FrameErrorParam;
+
+class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
+ public:
+ ~AV1FrameErrorTest();
+ void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
+ void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RandomValues(frame_error_func test_impl, int width, int height);
+ void ExtremeValues(frame_error_func test_impl, int width, int height);
+ void RunSpeedTest(frame_error_func test_impl, int width, int height);
+ libaom_test::ACMRandom rnd_;
+};
+AV1FrameErrorTest::~AV1FrameErrorTest() { ; }
+
+// Compares test_impl with the C reference on buffers of random 8-bit pixels.
+void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
+                                     int height) {
+  const int w = width, h = height;
+  const int stride = (((w * 3) / 2) + 15) & (~15);
+  const int max_blk_size = stride * h;
+  uint8_t *dst = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  uint8_t *ref = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  for (int i = 0; i < (max_blk_size); ++i) {
+    dst[i] = rnd_.Rand8();
+    ref[i] = rnd_.Rand8();
+  }
+  int64_t ref_error = av1_calc_frame_error_c(ref, stride, dst, w, h, stride);
+  int64_t test_error = test_impl(ref, stride, dst, w, h, stride);
+  EXPECT_EQ(test_error, ref_error) << w << "x" << h;  // non-fatal: still free
+  aom_free(dst);
+  aom_free(ref);
+}
+
+// Compares test_impl with the C reference at both extreme pixel differences.
+void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
+                                      int height) {
+  const int w = width, h = height;
+  const int stride = (((w * 3) / 2) + 15) & (~15);
+  const int max_blk_size = stride * h;
+  uint8_t *dst = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  uint8_t *ref = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+  for (int r = 0; r < 2; r++) {
+    if (r == 0) {
+      memset(dst, 0, max_blk_size);
+      memset(ref, 255, max_blk_size);
+    } else if (r == 1) {
+      memset(dst, 255, max_blk_size);
+      memset(ref, 0, max_blk_size);
+    }
+    int64_t ref_error = av1_calc_frame_error_c(ref, stride, dst, w, h, stride);
+    int64_t test_error = test_impl(ref, stride, dst, w, h, stride);
+    EXPECT_EQ(test_error, ref_error) << w << "x" << h;  // non-fatal: still free
+  }
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
+ int height) {
+ const int w = width;
+ const int h = height;
+ const int stride = (((w * 3) / 2) + 15) & (~15);
+ const int max_blk_size = stride * h;
+ uint8_t *dst = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+ uint8_t *ref = (uint8_t *)aom_memalign(16, max_blk_size * sizeof(uint8_t));
+ for (int i = 0; i < (max_blk_size); ++i) {
+ dst[i] = ref[i] = rnd_.Rand8();
+ }
+ const int num_loops = 10000000 / (w + h);
+ frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ frame_error_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(ref, stride, dst, w, h, stride);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ aom_free(dst);
+ aom_free(ref);
+ printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1FrameErrorTest, CheckOutput) {
+ RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+ ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1FrameErrorTest,
+ ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
+ ::testing::ValuesIn(kBlockWidth),
+ ::testing::ValuesIn(kBlockHeight)));
+#endif
+}; // namespace AV1FrameError
diff --git a/test/test.cmake b/test/test.cmake
index 9e55aea..c5c10e2 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -205,6 +205,7 @@
"${AOM_ROOT}/test/sum_squares_test.cc"
"${AOM_ROOT}/test/variance_test.cc"
"${AOM_ROOT}/test/wiener_test.cc"
+ "${AOM_ROOT}/test/frame_error_test.cc"
"${AOM_ROOT}/test/warp_filter_test.cc"
"${AOM_ROOT}/test/warp_filter_test_util.cc"
"${AOM_ROOT}/test/warp_filter_test_util.h")