Clean up obmc_sad function prototypes. Name 'wsrc', 'mask' and 'pre' explicitly, rather than using 'b', 'm' and 'a'. Change-Id: Iaee6d1ac1211b0b05b47cf98b50570089b12d600
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc index 9d8c2a2..95d56ae 100644 --- a/test/obmc_sad_test.cc +++ b/test/obmc_sad_test.cc
@@ -29,7 +29,7 @@ static const int kIterations = 1000; static const int kMaskMax = 64; -typedef unsigned int (*ObmcSadF)(const uint8_t *ref, int ref_stride, +typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask); //////////////////////////////////////////////////////////////////////////////// @@ -45,42 +45,42 @@ }; TEST_P(ObmcSadTest, RandomValues) { - DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) { - const int ref_stride = rng_(MAX_SB_SIZE + 1); + const int pre_stride = rng_(MAX_SB_SIZE + 1); for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) { - ref[i] = rng_.Rand8(); + pre[i] = rng_.Rand8(); wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1); mask[i] = rng_(kMaskMax * kMaskMax + 1); } - const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask); - const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask); + const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask); + const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask); ASSERT_EQ(ref_res, tst_res); } } TEST_P(ObmcSadTest, ExtremeValues) { - DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) { - const int ref_stride = iter; + const int pre_stride = iter; for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) { - ref[i] = UINT8_MAX; + pre[i] = UINT8_MAX; wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax; mask[i] = kMaskMax * kMaskMax; } - const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask); - const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask); + const unsigned int 
ref_res = ref_func_(pre, pre_stride, wsrc, mask); + const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask); ASSERT_EQ(ref_res, tst_res); } @@ -126,22 +126,22 @@ }; TEST_P(ObmcSadHBDTest, RandomValues) { - DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) { - const int ref_stride = rng_(MAX_SB_SIZE + 1); + const int pre_stride = rng_(MAX_SB_SIZE + 1); for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) { - ref[i] = rng_(1<<12); + pre[i] = rng_(1<<12); wsrc[i] = rng_(1<<12) * rng_(kMaskMax * kMaskMax + 1); mask[i] = rng_(kMaskMax * kMaskMax + 1); } - const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride, + const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask); - const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride, + const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask); ASSERT_EQ(ref_res, tst_res); @@ -149,22 +149,22 @@ } TEST_P(ObmcSadHBDTest, ExtremeValues) { - DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) { - const int ref_stride = iter; + const int pre_stride = iter; for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) { - ref[i] = (1 << 12) - 1; + pre[i] = (1 << 12) - 1; wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax; mask[i] = kMaskMax * kMaskMax; } - const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride, + const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask); - const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride, + const unsigned 
int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask); ASSERT_EQ(ref_res, tst_res);
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index bb1daf8..e64dae3 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c
@@ -452,23 +452,23 @@ #endif // CONFIG_VP10 && CONFIG_EXT_INTER #if CONFIG_VP10 && CONFIG_OBMC -// a: pred -// b: target weighted prediction (has been *4096 to keep precision) -// m: 2d weights (scaled by 4096) -static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride, - const int32_t *b, - const int32_t *m, +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) - sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12); + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); - a += a_stride; - b += width; - m += width; + pre += pre_stride; + wsrc += width; + mask += width; } return sad; @@ -477,8 +477,8 @@ #define OBMCSADMxN(m, n) \ unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ const int32_t *wsrc, \ - const int32_t *msk) { \ - return obmc_sad(ref, ref_stride, wsrc, msk, m, n); \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ } #if CONFIG_EXT_PARTITION @@ -501,21 +501,21 @@ OBMCSADMxN(4, 4) #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride, - const int32_t *b, - const int32_t *m, +static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int width, int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) - sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12); + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); - a += a_stride; - b += width; - m += width; + pre += pre_stride; + wsrc 
+= width; + mask += width; } return sad; @@ -525,8 +525,8 @@ unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref, \ int ref_stride, \ const int32_t *wsrc, \ - const int32_t *msk) { \ - return highbd_obmc_sad(ref, ref_stride, wsrc, msk, m, n); \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ } #if CONFIG_EXT_PARTITION
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 02c8727..d8055e9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1124,14 +1124,14 @@ if (vpx_config("CONFIG_OBMC") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask"; + add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; specialize "vpx_obmc_sad${w}x${h}", qw/sse4_1/; } if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask"; + add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; specialize "vpx_highbd_obmc_sad${w}x${h}", qw/sse4_1/; } }
diff --git a/vpx_dsp/x86/obmc_sad_sse4.c b/vpx_dsp/x86/obmc_sad_sse4.c index 57e1428..de12e1d 100644 --- a/vpx_dsp/x86/obmc_sad_sse4.c +++ b/vpx_dsp/x86/obmc_sad_sse4.c
@@ -21,26 +21,28 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE unsigned int obmc_sad_w4(const uint8_t *a, const int a_stride, - const int32_t *b, const int32_t *m, +static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, const int height) { - const int a_step = a_stride - 4; + const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); do { - const __m128i v_a_b = xx_loadl_32(a + n); - const __m128i v_m_d = xx_load_128(m + n); - const __m128i v_b_d = xx_load_128(b + n); + const __m128i v_p_b = xx_loadl_32(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); - const __m128i v_a_d = _mm_cvtepu8_epi32(v_a_b); + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - // Values in both a and m fit in 15 bits, and are packed at 32 bit + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. 
- const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d); + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d); + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); // Rounded absolute difference @@ -51,39 +53,42 @@ n += 4; if (n % 4 == 0) - a += a_step; + pre += pre_step; } while (n < 4 * height); return xx_hsum_epi32_si32(v_sad_d); } -static INLINE unsigned int obmc_sad_w8n(const uint8_t *a, const int a_stride, - const int32_t *b, const int32_t *m, - const int width, const int height) { - const int a_step = a_stride - width; +static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int width, + const int height) { + const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); assert(width >= 8 && (width & (width - 1)) == 0); do { - const __m128i v_a1_b = xx_loadl_32(a + n + 4); - const __m128i v_m1_d = xx_load_128(m + n + 4); - const __m128i v_b1_d = xx_load_128(b + n + 4); - const __m128i v_a0_b = xx_loadl_32(a + n); - const __m128i v_m0_d = xx_load_128(m + n); - const __m128i v_b0_d = xx_load_128(b + n); + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); - const __m128i v_a0_d = _mm_cvtepu8_epi32(v_a0_b); - const __m128i v_a1_d = _mm_cvtepu8_epi32(v_a1_b); + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); - // Values in both a and m fit in 15 bits, and are packed at 32 bit + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. 
We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. - const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d); - const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d); + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); - const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d); - const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d); + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); @@ -97,21 +102,21 @@ n += 8; if (n % width == 0) - a += a_step; + pre += pre_step; } while (n < width * height); return xx_hsum_epi32_si32(v_sad_d); } #define OBMCSADWXH(w, h) \ -unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref, \ - int ref_stride, \ +unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \ + int pre_stride, \ const int32_t *wsrc, \ const int32_t *msk) { \ if (w == 4) \ - return obmc_sad_w4(ref, ref_stride, wsrc, msk, h); \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ else \ - return obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h); \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ } #if CONFIG_EXT_PARTITION @@ -138,28 +143,29 @@ //////////////////////////////////////////////////////////////////////////////// #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *a8, - const int a_stride, - const int32_t *b, const int32_t *m, +static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, const int height) { - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const int a_step = a_stride - 4; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = 
_mm_setzero_si128(); do { - const __m128i v_a_w = xx_loadl_64(a + n); - const __m128i v_m_d = xx_load_128(m + n); - const __m128i v_b_d = xx_load_128(b + n); + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); - const __m128i v_a_d = _mm_cvtepu16_epi32(v_a_w); + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); - // Values in both a and m fit in 15 bits, and are packed at 32 bit + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. - const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d); + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d); + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); // Rounded absolute difference @@ -170,41 +176,43 @@ n += 4; if (n % 4 == 0) - a += a_step; + pre += pre_step; } while (n < 4 * height); return xx_hsum_epi32_si32(v_sad_d); } -static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *a8, - const int a_stride, - const int32_t *b, const int32_t *m, - const int width, const int height) { - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const int a_step = a_stride - width; +static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int width, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); assert(width >= 8 && (width & (width - 1)) == 0); do { - const __m128i v_a1_w = xx_loadl_64(a + n + 4); - const __m128i v_m1_d = xx_load_128(m + n + 4); - const __m128i v_b1_d = xx_load_128(b + n + 4); - const __m128i v_a0_w = xx_loadl_64(a + n); - const __m128i v_m0_d = xx_load_128(m + n); - const 
__m128i v_b0_d = xx_load_128(b + n); + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); - const __m128i v_a0_d = _mm_cvtepu16_epi32(v_a0_w); - const __m128i v_a1_d = _mm_cvtepu16_epi32(v_a1_w); + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); - // Values in both a and m fit in 15 bits, and are packed at 32 bit + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. - const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d); - const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d); + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); - const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d); - const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d); + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); @@ -218,21 +226,21 @@ n += 8; if (n % width == 0) - a += a_step; + pre += pre_step; } while (n < width * height); return xx_hsum_epi32_si32(v_sad_d); } #define HBD_OBMCSADWXH(w, h) \ -unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref, \ - int ref_stride, \ +unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \ + int pre_stride, \ const int32_t *wsrc, \ - const int32_t *msk) { \ + const int32_t *mask) { \ if (w == 4) \ - return hbd_obmc_sad_w4(ref, ref_stride, wsrc, msk, h); \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ else \ - return 
hbd_obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h); \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ } #if CONFIG_EXT_PARTITION