Make parameter names consistent

BUG=aomedia:2228

Change-Id: If701f16aec272e1df43174fb39d34c5a0f69babb
diff --git a/aom/src/aom_encoder.c b/aom/src/aom_encoder.c index 01917c9..7270797 100644 --- a/aom/src/aom_encoder.c +++ b/aom/src/aom_encoder.c
@@ -144,12 +144,12 @@ aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg, - unsigned int usage) { + unsigned int reserved) { aom_codec_err_t res; aom_codec_enc_cfg_map_t *map; int i; - if (!iface || !cfg || usage > INT_MAX) + if (!iface || !cfg || reserved > INT_MAX) res = AOM_CODEC_INVALID_PARAM; else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) res = AOM_CODEC_INCAPABLE; @@ -158,9 +158,9 @@ for (i = 0; i < iface->enc.cfg_map_count; ++i) { map = iface->enc.cfg_maps + i; - if (map->usage == (int)usage) { + if (map->usage == (int)reserved) { *cfg = map->cfg; - cfg->g_usage = usage; + cfg->g_usage = reserved; res = AOM_CODEC_OK; break; }
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index aa90ab7..b654a9a 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -350,7 +350,7 @@ # # Sub Pixel Filters # -add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h"; add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; @@ -358,7 +358,7 @@ specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3"; specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3"; -add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; +add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd"; specialize qw/aom_highbd_convolve_copy sse2 avx2/; add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; @@ -445,7 +445,7 @@ add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize 
qw/aom_highbd_lpf_horizontal_14 sse2/; -add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd"; +add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd"; specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; @@ -548,19 +548,19 @@ # # Alpha blending with mask # -add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params"; +add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params"; specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; -add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby"; +add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t 
src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; -add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd"; +add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd"; add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; -add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd"; +add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd"; 
specialize "aom_highbd_blend_a64_mask", qw/sse4_1/; specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/; specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c index a3f2618..40a02cc 100644 --- a/aom_dsp/loopfilter.c +++ b/aom_dsp/loopfilter.c
@@ -865,10 +865,10 @@ } } -void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); } void aom_highbd_lpf_horizontal_14_dual_c(
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c index 057f615..f38c43f 100644 --- a/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -870,7 +870,7 @@ const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { + int h, int subw, int subh) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); @@ -881,15 +881,15 @@ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); + mask, mask_stride, w, h, subw, subh); } else { - if (subx & suby) { + if (subw & subh) { blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); - } else if (subx) { + } else if (subw) { blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); - } else if (suby) { + } else if (subh) { blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } else {
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c index b7a2468..22c304e 100644 --- a/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -386,7 +386,7 @@ const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { + int h, int subw, int subh) { typedef void (*blend_fn)( uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, @@ -415,9 +415,9 @@ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); + mask, mask_stride, w, h, subw, subh); } else { - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } @@ -819,13 +819,13 @@ const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, - int subx, int suby, int bd) { + int subw, int subh, int bd) { typedef void (*blend_fn)( uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h); - // Dimensions are: bd_index X width_index X subx X suby + // Dimensions are: bd_index X width_index X subw X subh static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 { // w % 8 == 0 @@ -858,14 +858,14 @@ assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, mask_stride, w, h, subx, - suby, bd); + src1_stride, mask, mask_stride, w, h, subw, + subh, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( dst, 
dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); }
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c index 70b91c6..b906db7 100644 --- a/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -497,8 +497,9 @@ } void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, - const uint8_t *blt, const uint8_t *lt, - const uint8_t *thr, int bd) { + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { __m128i p[7], q[7], pq[7]; int i; @@ -507,7 +508,7 @@ q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); } - highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); for (i = 0; i < 6; i++) { _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index e69d275..d1a38fa 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -72,7 +72,7 @@ add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params"; -add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps"; +add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd"; specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h index d1f52a6..193237d 100644 --- a/av1/common/quant_common.h +++ b/av1/common/quant_common.h
@@ -51,9 +51,9 @@ return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; } void av1_qm_init(struct AV1Common *cm); -const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp, +const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qmlevel, int plane, TX_SIZE tx_size); -const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp, +const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qmlevel, int plane, TX_SIZE tx_size); #ifdef __cplusplus
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h index c1ad182..f2d4ac7 100644 --- a/av1/common/x86/cfl_simd.h +++ b/av1/common/x86/cfl_simd.h
@@ -15,169 +15,169 @@ #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void 
cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void 
cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); 
// SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride, +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void 
cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t 
*cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *input, int 
input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride, +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3); -void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); // SSE2 version is optimal for with == 4, we reuse them in AVX2 void 
cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c index c6bf917..462d7b8 100644 --- a/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -701,7 +701,7 @@ out[2] = _mm_unpacklo_epi64(v[1], v[3]); out[3] = _mm_unpackhi_epi64(v[1], v[3]); } -void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; @@ -710,61 +710,61 @@ switch (tx_type) { case DCT_DCT: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 
0); idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case IDTX: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, @@ -772,42 +772,42 @@ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_DCT: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case H_DCT: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_ADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); 
iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case H_ADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_FLIPADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case H_FLIPADST: - load_buffer_4x4(coeff, in); + load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); @@ -1415,7 +1415,7 @@ _mm_store_si128((__m128i *)(output + 7 * stride), u7); } -void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; @@ -1424,7 +1424,7 @@ switch (tx_type) { case DCT_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1433,7 +1433,7 @@ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1442,7 +1442,7 @@ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: - load_buffer_8x8(coeff, in); + 
load_buffer_8x8(input, in); transpose_8x8(in, out); idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1451,7 +1451,7 @@ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1460,7 +1460,7 @@ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1469,7 +1469,7 @@ write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1478,7 +1478,7 @@ write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1487,7 +1487,7 @@ write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); @@ -1496,7 +1496,7 @@ write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: - load_buffer_8x8(coeff, in); + load_buffer_8x8(input, in); transpose_8x8(in, out); iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c index f645e04..a38bd83 100644 --- a/av1/common/x86/reconinter_avx2.c +++ b/av1/common/x86/reconinter_avx2.c
@@ -28,8 +28,8 @@ } void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, - const uint8_t *src0, int stride0, - const uint8_t *src1, int stride1, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, int h, int w) { const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0; const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); @@ -37,18 +37,18 @@ if (4 == w) { do { const __m128i s0A = xx_loadl_32(src0); - const __m128i s0B = xx_loadl_32(src0 + stride0); - const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); const __m128i s1A = xx_loadl_32(src1); - const __m128i s1B = xx_loadl_32(src1 + stride1); - const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); @@ -58,40 +58,40 @@ const __m128i x_m8 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); xx_storeu_128(mask, x_m8); - src0 += (stride0 << 2); - src1 += (stride1 << 2); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); mask += 16; i += 4; } while (i < h); } else if (8 == w) { do { const __m128i s0A = xx_loadl_64(src0); - const __m128i s0B = xx_loadl_64(src0 + stride0); - const __m128i s0C = xx_loadl_64(src0 + 
stride0 * 2); - const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); const __m128i s1A = xx_loadl_64(src1); - const __m128i s1B = xx_loadl_64(src1 + stride1); - const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); yy_storeu_256(mask, m8); - src0 += stride0 << 2; - src1 += stride1 << 2; + src0 += src0_stride << 2; + src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (16 == w) { do { const __m128i s0A = xx_load_128(src0); - const __m128i s0B = xx_load_128(src0 + stride0); + const __m128i s0B = xx_load_128(src0 + src0_stride); const __m128i s1A = xx_load_128(src1); - const __m128i s1B = xx_load_128(src1 + stride1); + const __m128i s1B = xx_load_128(src1 + src1_stride); const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); @@ -103,8 +103,8 @@ const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); yy_storeu_256(mask, m8); - src0 += stride0 << 1; - src1 += stride1 << 1; + src0 += src0_stride << 1; + src1 += src1_stride << 1; mask += 32; i += 2; } while (i < 
h); @@ -127,8 +127,8 @@ yy_storeu_256(mask + j, m8); j += 32; } while (j < w); - src0 += stride0; - src1 += stride1; + src0 += src0_stride; + src1 += src1_stride; mask += w; i += 1; } while (i < h);