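x86,cosmetics: prefer _mm_setzero_si128/_mm256_setzero_si256

Use the dedicated zeroing intrinsics instead of *_set1_*(0) wherever an
all-zero vector is created. Both forms yield a register with every bit
cleared, so this is a readability change only.

Change-Id: I136e1798a2ce286480ebb9418db67a2f1e92b9a2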
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 09c63a3..dfbab32 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -1307,7 +1307,7 @@
const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
- const __m256i clip_low = _mm256_set1_epi16(0);
+ const __m256i clip_low = _mm256_setzero_si256();
const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index 270ad26..58a7345 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -1493,7 +1493,7 @@
const __m128i v_round_offset = _mm_set1_epi32(round_offset);
const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
- const __m128i clip_low = _mm_set1_epi16(0);
+ const __m128i clip_low = _mm_setzero_si128();
const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
diff --git a/aom_dsp/x86/loopfilter_avx2.c b/aom_dsp/x86/loopfilter_avx2.c
index b593819..af6c5da 100644
--- a/aom_dsp/x86/loopfilter_avx2.c
+++ b/aom_dsp/x86/loopfilter_avx2.c
@@ -32,7 +32,7 @@
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
const __m128i blimit_v =
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
p256_2 =
@@ -239,7 +239,7 @@
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
const __m128i blimit_v =
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
p256_3 =
@@ -486,7 +486,7 @@
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
const __m128i blimit_v =
_mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
p256_3 =
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 87c5bb3..731dd10 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -2133,7 +2133,7 @@
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
@@ -2438,7 +2438,7 @@
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
@@ -2630,7 +2630,7 @@
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
@@ -2802,7 +2802,7 @@
const unsigned char *_blimit0,
const unsigned char *_limit0,
const unsigned char *_thresh0) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
diff --git a/aom_dsp/x86/quantize_avx2.c b/aom_dsp/x86/quantize_avx2.c
index 5763bd6..b808d46 100644
--- a/aom_dsp/x86/quantize_avx2.c
+++ b/aom_dsp/x86/quantize_avx2.c
@@ -128,7 +128,7 @@
const int16_t *iscan) {
(void)scan;
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
- __m256i v_eobmax = _mm256_set1_epi16(0);
+ __m256i v_eobmax = _mm256_setzero_si256();
load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
&v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
@@ -211,7 +211,7 @@
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *iscan, int log_scale) {
__m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
- __m256i v_eobmax = _mm256_set1_epi16(0);
+ __m256i v_eobmax = _mm256_setzero_si256();
load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
&v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index 8f11ada..8ea0443 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -171,9 +171,9 @@
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
+ sum_reg = _mm256_setzero_si256();
+ sse_reg = _mm256_setzero_si256();
+ zero_reg = _mm256_setzero_si256();
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
@@ -359,9 +359,9 @@
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
+ sum_reg = _mm256_setzero_si256();
+ sse_reg = _mm256_setzero_si256();
+ zero_reg = _mm256_setzero_si256();
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
@@ -599,9 +599,9 @@
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
+ sum_reg = _mm256_setzero_si256();
+ sse_reg = _mm256_setzero_si256();
+ zero_reg = _mm256_setzero_si256();
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
diff --git a/av1/common/x86/highbd_convolve_2d_avx2.c b/av1/common/x86/highbd_convolve_2d_avx2.c
index 12046e4..429f8f1 100644
--- a/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -80,7 +80,7 @@
for (i = 0; i < im_h; i += 2) {
const __m256i row0 =
_mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 = _mm256_set1_epi16(0);
+ __m256i row1 = _mm256_setzero_si256();
if (i + 1 < im_h)
row1 =
_mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 37f8f42..de3af3a 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -517,7 +517,7 @@
static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
int bd, int out_shift) {
const int32_t *sinpi = sinpi_arr(bit);
- const __m128i zero = _mm_set1_epi32(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
rnding = _mm_unpacklo_epi32(rnding, zero);
const __m128i mul = _mm_set1_epi32(1 << 4);
@@ -698,7 +698,7 @@
int bd, int out_shift) {
(void)bit;
__m128i v[4];
- __m128i zero = _mm_set1_epi32(0);
+ __m128i zero = _mm_setzero_si128();
__m128i fact = _mm_set1_epi32(NewSqrt2);
__m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
__m128i a0_low, a1_low;
@@ -3142,7 +3142,7 @@
__m128i fact = _mm_set1_epi32(2 * NewSqrt2);
__m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
__m128i a0_low, a0_high, a1_low, a1_high;
- __m128i zero = _mm_set1_epi32(0);
+ __m128i zero = _mm_setzero_si128();
offset = _mm_unpacklo_epi32(offset, zero);
for (int i = 0; i < 16; i++) {
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index 9cedd44..da52ecd 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -282,7 +282,7 @@
for (i = 0; i < im_h; i += 2) {
const __m256i row0 =
_mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 = _mm256_set1_epi16(0);
+ __m256i row1 = _mm256_setzero_si256();
if (i + 1 < im_h)
row1 =
_mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c
index 87b1a66..7f6aceb 100644
--- a/av1/common/x86/highbd_warp_affine_avx2.c
+++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -158,7 +158,7 @@
iy = iy * stride;
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
@@ -218,7 +218,7 @@
_mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
@@ -333,7 +333,7 @@
iy = iy * stride;
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
@@ -454,7 +454,7 @@
_mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
__m256i v_refl = _mm256_inserti128_si256(
- _mm256_set1_epi16(0),
+ _mm256_setzero_si256(),
_mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
v_refl = _mm256_inserti128_si256(
v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
index b0c9a93..ceb836e 100644
--- a/av1/common/x86/warp_plane_avx2.c
+++ b/av1/common/x86/warp_plane_avx2.c
@@ -1028,12 +1028,12 @@
int64_t sum_error = 0;
int i, j;
__m256i row_error, col_error;
- __m256i zero = _mm256_set1_epi16(0);
+ __m256i zero = _mm256_setzero_si256();
__m256i dup_255 = _mm256_set1_epi16(255);
col_error = zero;
for (i = 0; i < (p_height / 4); i++) {
- row_error = _mm256_set1_epi16(0);
+ row_error = _mm256_setzero_si256();
for (j = 0; j < (p_width / 16); j++) {
__m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
(__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 6ff6665..f8fe578 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -21,7 +21,7 @@
int64_t sum_error = 0;
int i, j;
__m128i row_error, col_error;
- __m128i zero = _mm_set1_epi16(0);
+ __m128i zero = _mm_setzero_si128();
__m128i dup_255 = _mm_set1_epi16(255);
col_error = zero;
for (i = 0; i < (p_height); i++) {
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index 694e613..a5cbe16 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1425,7 +1425,7 @@
const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
- const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __zero = _mm_setzero_si128();
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
const __m128i in7 = _mm_add_epi16(input[0], input[1]);
__m128i u[8], v[8];
@@ -1573,7 +1573,7 @@
const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
- const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __zero = _mm_setzero_si128();
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
const __m128i in7 = _mm_add_epi16(input[0], input[1]);
__m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];