Add compute_stats SSE4_1 and AVX2 code
1. add compute_stats unit test
2. add compute_stats SSE4_1
3. add compute_stats AVX2
Encoder speedup is about 0.9%, with no change in RD (rate-distortion) performance.
test sequence: BasketballDrill_832x480_50.y4m
test command line: ./aomenc --cpu-used=1 --psnr -D \
-q --end-usage=vbr --target-bitrate=800 --limit=20 \
BasketballDrill_832x480_50.y4m -otest.webm
Change-Id: Ic0799997c1075a139869b45ba84af00c5475964a
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index d1ac8de..3b4831b 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -15,6 +15,7 @@
#include <math.h>
#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/binary_codes_writer.h"
@@ -22,7 +23,6 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
-
#include "av1/common/onyxc_int.h"
#include "av1/common/quant_common.h"
#include "av1/common/restoration.h"
@@ -588,22 +588,9 @@
if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
}
-static double find_average(const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int stride) {
- uint64_t sum = 0;
- double avg = 0;
- int i, j;
- aom_clear_system_state();
- for (i = v_start; i < v_end; i++)
- for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
- avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
- return avg;
-}
-
-static void compute_stats(int wiener_win, const uint8_t *dgd,
- const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int dgd_stride,
- int src_stride, double *M, double *H) {
+void compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
int i, j, k, l;
double Y[WIENER_WIN2];
const int wiener_win2 = wiener_win * wiener_win;
@@ -626,8 +613,7 @@
assert(idx == wiener_win2);
for (k = 0; k < wiener_win2; ++k) {
M[k] += Y[k] * X;
- H[k * wiener_win2 + k] += Y[k] * Y[k];
- for (l = k + 1; l < wiener_win2; ++l) {
+ for (l = k; l < wiener_win2; ++l) {
// H is a symmetric matrix, so we only need to fill out the upper
// triangle here. We can copy it down to the lower triangle outside
// the (i, j) loops.