Remove some unused functions.

Drop unused static helper functions from aom_dsp/aom_convolve.c,
aom_dsp/x86/intrapred_avx2.c, av1/common/convolve.c,
av1/common/reconinter.c, av1/common/x86/cfl_ssse3.c,
av1/common/x86/optflow_refine_sse4.c, av1/encoder/x86/pickrst_avx2.c
and av1/encoder/x86/pickrst_sse4.c, along with the unused kMaskMax
constant in test/variance_test.cc. Also rename the clang CI job's
CMAKE_FLAGS variable to EXTRA_CMAKE_FLAGS and stop passing
-DENABLE_CCACHE=1 there.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 14c5a9a..59cde14 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -418,10 +418,8 @@
- debug
- release
variables:
- CMAKE_FLAGS: >-
- -DENABLE_CCACHE=1
- -DCMAKE_C_COMPILER=clang
- -DCMAKE_CXX_COMPILER=clang++
+ EXTRA_CMAKE_FLAGS: >-
+ -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
Example Build (x86_64-gcc):
stage: build
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index ef68c63..9463048 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -21,19 +21,6 @@
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
-static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
- return sum;
-}
-
-static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
- const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
- return sum;
-}
-
static const InterpKernel *get_filter_base(const int16_t *filter) {
// NOTE: This assumes that the filter table is 256-byte aligned.
return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index adaf428..cc6f67d 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -16,56 +16,6 @@
#include "aom_dsp/x86/intrapred_x86.h"
#include "aom_dsp/x86/lpf_common_sse2.h"
-static INLINE __m256i dc_sum_64(const uint8_t *ref) {
- const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
- const __m256i zero = _mm256_setzero_si256();
- __m256i y0 = _mm256_sad_epu8(x0, zero);
- __m256i y1 = _mm256_sad_epu8(x1, zero);
- y0 = _mm256_add_epi64(y0, y1);
- __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
- y0 = _mm256_add_epi64(u0, y0);
- u0 = _mm256_unpackhi_epi64(y0, y0);
- return _mm256_add_epi16(y0, u0);
-}
-
-static INLINE __m256i dc_sum_32(const uint8_t *ref) {
- const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i zero = _mm256_setzero_si256();
- __m256i y = _mm256_sad_epu8(x, zero);
- __m256i u = _mm256_permute2x128_si256(y, y, 1);
- y = _mm256_add_epi64(u, y);
- u = _mm256_unpackhi_epi64(y, y);
- return _mm256_add_epi16(y, u);
-}
-
-static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm256_storeu_si256((__m256i *)dst, *r);
- dst += stride;
- }
-}
-
-static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
- int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm256_storeu_si256((__m256i *)dst, *r0);
- _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
- dst += stride;
- }
-}
-
-static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm256_storeu_si256((__m256i *)dst, *r);
- _mm256_storeu_si256((__m256i *)(dst + 32), *r);
- dst += stride;
- }
-}
-
static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 06cf0eb..e1a3b3e 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -607,12 +607,6 @@
// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
#define WIENER_MAX_EXT_SIZE 263
-static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
- return sum;
-}
-
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
const int16_t *b) {
int sum = 0;
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index ee7152c..5a8f14e 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -352,21 +352,6 @@
}
}
-static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse,
- int mask_base, const uint8_t *src0,
- int src0_stride, const uint8_t *src1,
- int src1_stride, int h, int w) {
- int i, j, m, diff;
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- diff =
- abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
- m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
- mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
- }
- }
-}
-
static AOM_FORCE_INLINE void diffwtd_mask_highbd(
uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
@@ -1066,25 +1051,6 @@
}
#if OPFL_COMBINE_INTERP_GRAD_LS
-static AOM_FORCE_INLINE void compute_pred_using_interp_grad(
- const uint8_t *src1, const uint8_t *src2, int16_t *dst1, int16_t *dst2,
- int bw, int bh, int d0, int d1) {
- for (int i = 0; i < bh; ++i) {
- for (int j = 0; j < bw; ++j) {
- // To avoid overflow, we clamp d0*P0-d1*P1 and P0-P1. Since d0 and d1 are
- // at most 5 bits, this clamping is only required in highbd, but it is
- // also added here for consistency.
- int32_t tmp_dst =
- d0 * (int32_t)src1[i * bw + j] - d1 * (int32_t)src2[i * bw + j];
- dst1[i * bw + j] = clamp(tmp_dst, INT16_MIN, INT16_MAX);
- tmp_dst = d0 * ((int32_t)src1[i * bw + j] - (int32_t)src2[i * bw + j]);
- dst2[i * bw + j] = clamp(tmp_dst, INT16_MIN, INT16_MAX);
- }
- }
-}
-#endif // OPFL_COMBINE_INTERP_GRAD_LS
-
-#if OPFL_COMBINE_INTERP_GRAD_LS
static AOM_FORCE_INLINE void compute_pred_using_interp_grad_highbd(
const uint16_t *src1, const uint16_t *src2, int16_t *dst1, int16_t *dst2,
int bw, int bh, int d0, int d1) {
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index ec7152d..64a8390 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -18,11 +18,6 @@
#include "av1/common/x86/cfl_simd.h"
-// Load 32-bit integer from memory into the first element of dst.
-static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
- return _mm_cvtsi32_si128(*((int *)mem_addr));
-}
-
// Store 32-bit integer from the first element of a into memory.
static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
*((int *)mem_addr) = _mm_cvtsi128_si32(a);
diff --git a/av1/common/x86/optflow_refine_sse4.c b/av1/common/x86/optflow_refine_sse4.c
index 6a31b8f..5ebae7d 100644
--- a/av1/common/x86/optflow_refine_sse4.c
+++ b/av1/common/x86/optflow_refine_sse4.c
@@ -308,10 +308,6 @@
#endif // OPFL_BICUBIC_GRAD
}
-static INLINE __m128i LoadLo8(const void *a) {
- return _mm_loadl_epi64((const __m128i *)a);
-}
-
static INLINE __m128i LoadAligned16(const void *a) {
return _mm_load_si128((const __m128i *)a);
}
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 24caca9..161a5ff 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -20,148 +20,6 @@
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
-static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
- const __m128i *shuffle, const __m256i *kl) {
- const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
- const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
- const __m256i dst0 = yy_load_256(dst);
- const __m256i r0 = _mm256_add_epi32(dst0, d0);
- yy_store_256(dst, r0);
-}
-
-static INLINE void acc_stat_win7_one_line_avx2(
- const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
- int dgd_stride, const __m128i *shuffle, int32_t *sumX,
- int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
- int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
- int j, k, l;
- const int wiener_win = WIENER_WIN;
- // Main loop handles two pixels at a time
- // We can assume that h_start is even, since it will always be aligned to
- // a tile edge + some number of restoration units, and both of those will
- // be 64-pixel aligned.
- // However, at the edge of the image, h_end may be odd, so we need to handle
- // that case correctly.
- assert(h_start % 2 == 0);
- const int h_end_even = h_end & ~1;
- const int has_odd_pixel = h_end & 1;
- for (j = h_start; j < h_end_even; j += 2) {
- const uint8_t X1 = src[j];
- const uint8_t X2 = src[j + 1];
- *sumX += X1 + X2;
- const uint8_t *dgd_ij = dgd + j;
- for (k = 0; k < wiener_win; k++) {
- const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
- for (l = 0; l < wiener_win; l++) {
- int32_t *H_ = &H_int[(l * wiener_win + k)][0];
- const uint8_t D1 = dgd_ijk[l];
- const uint8_t D2 = dgd_ijk[l + 1];
- sumY[k][l] += D1 + D2;
- M_int[k][l] += D1 * X1 + D2 * X2;
-
- const __m256i kl =
- _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
- acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
- }
- }
- }
- // If the width is odd, add in the final pixel
- if (has_odd_pixel) {
- const uint8_t X1 = src[j];
- *sumX += X1;
- const uint8_t *dgd_ij = dgd + j;
- for (k = 0; k < wiener_win; k++) {
- const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
- for (l = 0; l < wiener_win; l++) {
- int32_t *H_ = &H_int[(l * wiener_win + k)][0];
- const uint8_t D1 = dgd_ijk[l];
- sumY[k][l] += D1;
- M_int[k][l] += D1 * X1;
-
- // The `acc_stat_avx2` function wants its input to have interleaved
- // copies of two pixels, but we only have one. However, the pixels
- // are (effectively) used as inputs to a multiply-accumulate.
- // So if we set the extra pixel slot to 0, then it is effectively
- // ignored.
- const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
- acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
- acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
- }
- }
- }
-}
-
-static INLINE void compute_stats_win7_opt_avx2(
- const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
- int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
- int i, j, k, l, m, n;
- const int wiener_win = WIENER_WIN;
- const int pixel_count = (h_end - h_start) * (v_end - v_start);
- const int wiener_win2 = wiener_win * wiener_win;
- const int wiener_halfwin = (wiener_win >> 1);
- uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
- int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
-
- DECLARE_ALIGNED(32, int32_t,
- H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
- int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
- int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int32_t sumX = 0;
- const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
- const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
- for (j = v_start; j < v_end; j += 64) {
- const int vert_end = AOMMIN(64, v_end - j) + j;
- for (i = j; i < vert_end; i++) {
- acc_stat_win7_one_line_avx2(
- dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
- dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
- }
- for (k = 0; k < wiener_win; ++k) {
- for (l = 0; l < wiener_win; ++l) {
- M_int64[k][l] += M_int32[k][l];
- M_int32[k][l] = 0;
- }
- }
- for (k = 0; k < WIENER_WIN2; ++k) {
- for (l = 0; l < WIENER_WIN * 8; ++l) {
- H_int64[k][l] += H_int32[k][l];
- H_int32[k][l] = 0;
- }
- }
- }
-
- const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
- for (k = 0; k < wiener_win; k++) {
- for (l = 0; l < wiener_win; l++) {
- const int32_t idx0 = l * wiener_win + k;
- M[idx0] =
- M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
- int64_t *H_ = H + idx0 * wiener_win2;
- int64_t *H_int_ = &H_int64[idx0][0];
- for (m = 0; m < wiener_win; m++) {
- for (n = 0; n < wiener_win; n++) {
- H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
- (int64_t)avg * (sumY[k][l] + sumY[n][m]);
- }
- }
- }
- }
-}
-
static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
const __m256i *shuffle,
const __m256i *dgd_ijkl) {
@@ -505,99 +363,6 @@
}
}
-// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
-// C and H need to be computed.
-static AOM_INLINE void calc_proj_params_r0_r1_avx2(
- const uint8_t *src8, int width, int height, int src_stride,
- const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
- int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
- const int size = width * height;
- const uint8_t *src = src8;
- const uint8_t *dat = dat8;
- __m256i h00, h01, h11, c0, c1;
- const __m256i zero = _mm256_setzero_si256();
- h01 = h11 = c0 = c1 = h00 = zero;
-
- for (int i = 0; i < height; ++i) {
- for (int j = 0; j < width; j += 8) {
- const __m256i u_load = _mm256_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
- const __m256i s_load = _mm256_cvtepu8_epi32(
- _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
- __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
- __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
- __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
- __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
- s = _mm256_sub_epi32(s, d);
- f1 = _mm256_sub_epi32(f1, d);
- f2 = _mm256_sub_epi32(f2, d);
-
- const __m256i h00_even = _mm256_mul_epi32(f1, f1);
- const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
- _mm256_srli_epi64(f1, 32));
- h00 = _mm256_add_epi64(h00, h00_even);
- h00 = _mm256_add_epi64(h00, h00_odd);
-
- const __m256i h01_even = _mm256_mul_epi32(f1, f2);
- const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
- _mm256_srli_epi64(f2, 32));
- h01 = _mm256_add_epi64(h01, h01_even);
- h01 = _mm256_add_epi64(h01, h01_odd);
-
- const __m256i h11_even = _mm256_mul_epi32(f2, f2);
- const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
- _mm256_srli_epi64(f2, 32));
- h11 = _mm256_add_epi64(h11, h11_even);
- h11 = _mm256_add_epi64(h11, h11_odd);
-
- const __m256i c0_even = _mm256_mul_epi32(f1, s);
- const __m256i c0_odd =
- _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
- c0 = _mm256_add_epi64(c0, c0_even);
- c0 = _mm256_add_epi64(c0, c0_odd);
-
- const __m256i c1_even = _mm256_mul_epi32(f2, s);
- const __m256i c1_odd =
- _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
- c1 = _mm256_add_epi64(c1, c1_even);
- c1 = _mm256_add_epi64(c1, c1_odd);
- }
- }
-
- __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
- const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
- c_low = _mm256_add_epi64(c_low, c_high);
- const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
- _mm256_castsi256_si128(c_low));
-
- __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
- const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
- h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
- const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
- _mm256_castsi256_si128(h0x_low));
-
- // Using the symmetric properties of H, calculations of H[1][0] are not
- // needed.
- __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
- const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
- h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
- const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
- _mm256_castsi256_si128(h1x_low));
-
- xx_storeu_128(C, c_128bit);
- xx_storeu_128(H[0], h0x_128bit);
- xx_storeu_128(H[1], h1x_128bit);
-
- H[0][0] /= size;
- H[0][1] /= size;
- H[1][1] /= size;
-
- // Since H is a symmetric matrix
- H[1][0] = H[0][1];
- C[0] /= size;
- C[1] /= size;
-}
-
int64_t av1_highbd_pixel_proj_error_avx2(
const uint8_t *src8, int width, int height, int src_stride,
const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index f97ad56..5b454b7 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -18,152 +18,6 @@
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
-static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
- const __m128i *shuffle, const __m128i *kl) {
- const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
- const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
- const __m128i d1 =
- _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
- const __m128i dst0 = xx_loadu_128(dst);
- const __m128i dst1 = xx_loadu_128(dst + 4);
- const __m128i r0 = _mm_add_epi32(dst0, d0);
- const __m128i r1 = _mm_add_epi32(dst1, d1);
- xx_storeu_128(dst, r0);
- xx_storeu_128(dst + 4, r1);
-}
-
-static INLINE void acc_stat_win7_one_line_sse4_1(
- const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
- int dgd_stride, const __m128i *shuffle, int32_t *sumX,
- int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
- int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
- const int wiener_win = 7;
- int j, k, l;
- // Main loop handles two pixels at a time
- // We can assume that h_start is even, since it will always be aligned to
- // a tile edge + some number of restoration units, and both of those will
- // be 64-pixel aligned.
- // However, at the edge of the image, h_end may be odd, so we need to handle
- // that case correctly.
- assert(h_start % 2 == 0);
- const int h_end_even = h_end & ~1;
- const int has_odd_pixel = h_end & 1;
- for (j = h_start; j < h_end_even; j += 2) {
- const uint8_t *dgd_ij = dgd + j;
- const uint8_t X1 = src[j];
- const uint8_t X2 = src[j + 1];
- *sumX += X1 + X2;
- for (k = 0; k < wiener_win; k++) {
- const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
- for (l = 0; l < wiener_win; l++) {
- int32_t *H_ = &H_int[(l * wiener_win + k)][0];
- const uint8_t D1 = dgd_ijk[l];
- const uint8_t D2 = dgd_ijk[l + 1];
- sumY[k][l] += D1 + D2;
- M_int[k][l] += D1 * X1 + D2 * X2;
-
- const __m128i kl =
- _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
- acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
- }
- }
- }
- // If the width is odd, add in the final pixel
- if (has_odd_pixel) {
- const uint8_t *dgd_ij = dgd + j;
- const uint8_t X1 = src[j];
- *sumX += X1;
- for (k = 0; k < wiener_win; k++) {
- const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
- for (l = 0; l < wiener_win; l++) {
- int32_t *H_ = &H_int[(l * wiener_win + k)][0];
- const uint8_t D1 = dgd_ijk[l];
- sumY[k][l] += D1;
- M_int[k][l] += D1 * X1;
-
- // The `acc_stat_sse41` function wants its input to have interleaved
- // copies of two pixels, but we only have one. However, the pixels
- // are (effectively) used as inputs to a multiply-accumulate.
- // So if we set the extra pixel slot to 0, then it is effectively
- // ignored.
- const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
- acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
- acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
- }
- }
- }
-}
-
-static INLINE void compute_stats_win7_opt_sse4_1(
- const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
- int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
- int i, j, k, l, m, n;
- const int wiener_win = WIENER_WIN;
- const int pixel_count = (h_end - h_start) * (v_end - v_start);
- const int wiener_win2 = wiener_win * wiener_win;
- const int wiener_halfwin = (wiener_win >> 1);
- const uint8_t avg =
- find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
- int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
- int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
- int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
- int32_t sumX = 0;
- const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
- const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
- for (j = v_start; j < v_end; j += 64) {
- const int vert_end = AOMMIN(64, v_end - j) + j;
- for (i = j; i < vert_end; i++) {
- acc_stat_win7_one_line_sse4_1(
- dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
- dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
- }
- for (k = 0; k < wiener_win; ++k) {
- for (l = 0; l < wiener_win; ++l) {
- M_int64[k][l] += M_int32[k][l];
- M_int32[k][l] = 0;
- }
- }
- for (k = 0; k < WIENER_WIN2; ++k) {
- for (l = 0; l < WIENER_WIN * 8; ++l) {
- H_int64[k][l] += H_int32[k][l];
- H_int32[k][l] = 0;
- }
- }
- }
-
- const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
- for (k = 0; k < wiener_win; k++) {
- for (l = 0; l < wiener_win; l++) {
- const int32_t idx0 = l * wiener_win + k;
- M[idx0] =
- M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
- int64_t *H_ = H + idx0 * wiener_win2;
- int64_t *H_int_ = &H_int64[idx0][0];
- for (m = 0; m < wiener_win; m++) {
- for (n = 0; n < wiener_win; n++) {
- H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
- (int64_t)avg * (sumY[k][l] + sumY[n][m]);
- }
- }
- }
- }
-}
-
static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd,
const __m128i *shuffle,
const __m128i *dgd_ijkl) {
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 785371a..1fa219d 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -769,8 +769,6 @@
////////////////////////////////////////////////////////////////////////////////
-static const int kMaskMax = 64;
-
typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
template <typename FunctionType>