Make parameter names consistent

Rename parameters so that declarations, definitions and the rtcd
prototypes use matching names: usage -> reserved in
aom_codec_enc_config_default(), subx/suby -> subw/subh in the
blend_a64 mask functions, input -> cfl_type in the cfl_simd.h
prototypes, coeff -> input in the highbd inverse transforms,
stride0/stride1 -> src0_stride/src1_stride in
av1_build_compound_diffwtd_mask_avx2(), and blt/lt/thr ->
blimit/limit/thresh in aom_highbd_lpf_horizontal_14_sse2(), among
others. Also fix the limt1 typo in the
aom_highbd_lpf_horizontal_14_dual prototype.
BUG=aomedia:2228
Change-Id: If701f16aec272e1df43174fb39d34c5a0f69babb
diff --git a/aom/src/aom_encoder.c b/aom/src/aom_encoder.c
index 01917c9..7270797 100644
--- a/aom/src/aom_encoder.c
+++ b/aom/src/aom_encoder.c
@@ -144,12 +144,12 @@
aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
aom_codec_enc_cfg_t *cfg,
- unsigned int usage) {
+ unsigned int reserved) {
aom_codec_err_t res;
aom_codec_enc_cfg_map_t *map;
int i;
- if (!iface || !cfg || usage > INT_MAX)
+ if (!iface || !cfg || reserved > INT_MAX)
res = AOM_CODEC_INVALID_PARAM;
else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
res = AOM_CODEC_INCAPABLE;
@@ -158,9 +158,9 @@
for (i = 0; i < iface->enc.cfg_map_count; ++i) {
map = iface->enc.cfg_maps + i;
- if (map->usage == (int)usage) {
+ if (map->usage == (int)reserved) {
*cfg = map->cfg;
- cfg->g_usage = usage;
+ cfg->g_usage = reserved;
res = AOM_CODEC_OK;
break;
}
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index aa90ab7..b654a9a 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -350,7 +350,7 @@
#
# Sub Pixel Filters
#
-add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h";
add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -358,7 +358,7 @@
specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
-add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
+add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd";
specialize qw/aom_highbd_convolve_copy sse2 avx2/;
add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
@@ -445,7 +445,7 @@
add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd";
+add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
@@ -548,19 +548,19 @@
#
# Alpha blending with mask
#
-add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
+add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params";
specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
-add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
+add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
-add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
+add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd";
add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
+add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index a3f2618..40a02cc 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -865,10 +865,10 @@
}
}
-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
}
void aom_highbd_lpf_horizontal_14_dual_c(
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 057f615..f38c43f 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -870,7 +870,7 @@
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w,
- int h, int subx, int suby) {
+ int h, int subw, int subh) {
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
@@ -881,15 +881,15 @@
if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, mask_stride, w, h, subx, suby);
+ mask, mask_stride, w, h, subw, subh);
} else {
- if (subx & suby) {
+ if (subw & subh) {
blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, w, h);
- } else if (subx) {
+ } else if (subw) {
blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, w, h);
- } else if (suby) {
+ } else if (subh) {
blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, w, h);
} else {
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index b7a2468..22c304e 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -386,7 +386,7 @@
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w,
- int h, int subx, int suby) {
+ int h, int subw, int subh) {
typedef void (*blend_fn)(
uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
@@ -415,9 +415,9 @@
if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, mask_stride, w, h, subx, suby);
+ mask, mask_stride, w, h, subw, subh);
} else {
- blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
+ blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
src0_stride, src1, src1_stride,
mask, mask_stride, w, h);
}
@@ -819,13 +819,13 @@
const uint8_t *src1_8,
uint32_t src1_stride, const uint8_t *mask,
uint32_t mask_stride, int w, int h,
- int subx, int suby, int bd) {
+ int subw, int subh, int bd) {
typedef void (*blend_fn)(
uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h);
- // Dimensions are: bd_index X width_index X subx X suby
+ // Dimensions are: bd_index X width_index X subw X subh
static const blend_fn blend[2][2][2][2] = {
{ // bd == 8 or 10
{ // w % 8 == 0
@@ -858,14 +858,14 @@
assert(bd == 8 || bd == 10 || bd == 12);
if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
- src1_stride, mask, mask_stride, w, h, subx,
- suby, bd);
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, bd);
} else {
uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
- blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
+ blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
mask_stride, w, h);
}
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 70b91c6..b906db7 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -497,8 +497,9 @@
}
void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
- const uint8_t *blt, const uint8_t *lt,
- const uint8_t *thr, int bd) {
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
__m128i p[7], q[7], pq[7];
int i;
@@ -507,7 +508,7 @@
q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
}
- highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
for (i = 0; i < 6; i++) {
_mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e69d275..d1a38fa 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -72,7 +72,7 @@
add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
-add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps";
+add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd";
specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h
index d1f52a6..193237d 100644
--- a/av1/common/quant_common.h
+++ b/av1/common/quant_common.h
@@ -51,9 +51,9 @@
return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
}
void av1_qm_init(struct AV1Common *cm);
-const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp,
+const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qmlevel, int plane,
TX_SIZE tx_size);
-const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
+const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qmlevel, int plane,
TX_SIZE tx_size);
#ifdef __cplusplus
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index c1ad182..f2d4ac7 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -15,169 +15,169 @@
#include "av1/common/blockd.h"
// SSSE3 version is optimal for with == 4, we reuse them in AVX2
-void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
// SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
// SSSE3 version is optimal for with == 16, we reuse it in AVX2
-void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for with == 4, we reuse them in AVX2
-void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
// SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
// SSSE3 version is optimal for with == 16, we reuse it in AVX2
-void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for with == 4, we reuse them in AVX2
-void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
// SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
// SSSE3 version is optimal for with == 16, we reuse it in AVX2
-void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
+void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
-void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for with == 16, we reuse it in AVX2
-void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
-void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for with == 16, we reuse it in AVX2
-void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
-void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for with == 8, we reuse it in AVX2
-void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
+void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
uint16_t *output_q3);
-void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for with == 16, we reuse it in AVX2
-void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
-void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
- uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
// SSE2 version is optimal for with == 4, we reuse them in AVX2
void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index c6bf917..462d7b8 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -701,7 +701,7 @@
out[2] = _mm_unpacklo_epi64(v[1], v[3]);
out[3] = _mm_unpackhi_epi64(v[1], v[3]);
}
-void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
@@ -710,61 +710,61 @@
switch (tx_type) {
case DCT_DCT:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case IDTX:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
0);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
@@ -772,42 +772,42 @@
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case V_DCT:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
0);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case H_DCT:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case V_ADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case H_ADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
0);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case V_FLIPADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
0);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case H_FLIPADST:
- load_buffer_4x4(coeff, in);
+ load_buffer_4x4(input, in);
iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
0);
@@ -1415,7 +1415,7 @@
_mm_store_si128((__m128i *)(output + 7 * stride), u7);
}
-void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[16], out[16];
const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
@@ -1424,7 +1424,7 @@
switch (tx_type) {
case DCT_DCT:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1433,7 +1433,7 @@
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1442,7 +1442,7 @@
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1451,7 +1451,7 @@
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1460,7 +1460,7 @@
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1469,7 +1469,7 @@
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1478,7 +1478,7 @@
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1487,7 +1487,7 @@
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
@@ -1496,7 +1496,7 @@
write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
- load_buffer_8x8(coeff, in);
+ load_buffer_8x8(input, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
-shift[0]);
diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c
index f645e04..a38bd83 100644
--- a/av1/common/x86/reconinter_avx2.c
+++ b/av1/common/x86/reconinter_avx2.c
@@ -28,8 +28,8 @@
}
void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
DIFFWTD_MASK_TYPE mask_type,
- const uint8_t *src0, int stride0,
- const uint8_t *src1, int stride1,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
int h, int w) {
const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
@@ -37,18 +37,18 @@
if (4 == w) {
do {
const __m128i s0A = xx_loadl_32(src0);
- const __m128i s0B = xx_loadl_32(src0 + stride0);
- const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
- const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+ const __m128i s0B = xx_loadl_32(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
const __m128i s1A = xx_loadl_32(src1);
- const __m128i s1B = xx_loadl_32(src1 + stride1);
- const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
- const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+ const __m128i s1B = xx_loadl_32(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
@@ -58,40 +58,40 @@
const __m128i x_m8 =
_mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
xx_storeu_128(mask, x_m8);
- src0 += (stride0 << 2);
- src1 += (stride1 << 2);
+ src0 += (src0_stride << 2);
+ src1 += (src1_stride << 2);
mask += 16;
i += 4;
} while (i < h);
} else if (8 == w) {
do {
const __m128i s0A = xx_loadl_64(src0);
- const __m128i s0B = xx_loadl_64(src0 + stride0);
- const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
- const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
const __m128i s1A = xx_loadl_64(src1);
- const __m128i s1B = xx_loadl_64(src1 + stride1);
- const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
- const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
yy_storeu_256(mask, m8);
- src0 += stride0 << 2;
- src1 += stride1 << 2;
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
mask += 32;
i += 4;
} while (i < h);
} else if (16 == w) {
do {
const __m128i s0A = xx_load_128(src0);
- const __m128i s0B = xx_load_128(src0 + stride0);
+ const __m128i s0B = xx_load_128(src0 + src0_stride);
const __m128i s1A = xx_load_128(src1);
- const __m128i s1B = xx_load_128(src1 + stride1);
+ const __m128i s1B = xx_load_128(src1 + src1_stride);
const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
@@ -103,8 +103,8 @@
const __m256i m8 =
_mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
yy_storeu_256(mask, m8);
- src0 += stride0 << 1;
- src1 += stride1 << 1;
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
mask += 32;
i += 2;
} while (i < h);
@@ -127,8 +127,8 @@
yy_storeu_256(mask + j, m8);
j += 32;
} while (j < w);
- src0 += stride0;
- src1 += stride1;
+ src0 += src0_stride;
+ src1 += src1_stride;
mask += w;
i += 1;
} while (i < h);