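[x86]: aom_smooth_h_predictor_{16,32,64} cosmetics.

Rename the dest parameter to dst and drop the redundant top, left_ptr and
dst alias locals along with their casts. Make scale const and introduce a
separate round constant instead of reassigning scale to hold the rounding
offset. Express the 256/128 literals via SMOOTH_WEIGHT_LOG2_SCALE
throughout. No functional change intended.
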
Change-Id: I19b3e287ca16c85e9bffaef2ab941905214f78a7
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 9af5e78..5482ef5 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -1642,14 +1642,13 @@
}
void aom_smooth_h_predictor_16x4_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
const __m128i left = cvtepu8_epi16(Load4(left_column));
const __m128i weights = LoadUnaligned16(smooth_weights + 12);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1658,38 +1657,36 @@
_mm_mullo_epi16(inverted_weights1, top_right);
const __m128i scaled_top_right2 =
_mm_mullo_epi16(inverted_weights2, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
__m128i y_mask = _mm_set1_epi32(0x01000100);
__m128i left_y = _mm_shuffle_epi8(left, y_mask);
- uint8_t *dst = (uint8_t *)dest;
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
y_mask = _mm_set1_epi32(0x03020302);
left_y = _mm_shuffle_epi8(left, y_mask);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
y_mask = _mm_set1_epi32(0x05040504);
left_y = _mm_shuffle_epi8(left, y_mask);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
y_mask = _mm_set1_epi32(0x07060706);
left_y = _mm_shuffle_epi8(left, y_mask);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
}
void aom_smooth_h_predictor_16x8_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
const __m128i weights = LoadUnaligned16(smooth_weights + 12);
- __m128i scale = _mm_set1_epi16(256);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1698,25 +1695,23 @@
_mm_mullo_epi16(inverted_weights1, top_right);
const __m128i scaled_top_right2 =
_mm_mullo_epi16(inverted_weights2, top_right);
- scale = _mm_set1_epi16(128);
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
}
void aom_smooth_h_predictor_16x16_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
const __m128i weights = LoadUnaligned16(smooth_weights + 12);
- __m128i scale = _mm_set1_epi16(256);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1725,35 +1720,32 @@
_mm_mullo_epi16(inverted_weights1, top_right);
const __m128i scaled_top_right2 =
_mm_mullo_epi16(inverted_weights2, top_right);
- scale = _mm_set1_epi16(128);
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
__m128i left = cvtepu8_epi16(LoadLo8(left_column));
- uint8_t *dst = (uint8_t *)dest;
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
}
void aom_smooth_h_predictor_16x32_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
const __m128i weights = LoadUnaligned16(smooth_weights + 12);
- __m128i scale = _mm_set1_epi16(256);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1762,51 +1754,48 @@
_mm_mullo_epi16(inverted_weights1, top_right);
const __m128i scaled_top_right2 =
_mm_mullo_epi16(inverted_weights2, top_right);
- scale = _mm_set1_epi16(128);
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
__m128i left = cvtepu8_epi16(LoadLo8(left_column));
- uint8_t *dst = dest;
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
dst += stride;
}
}
void aom_smooth_h_predictor_16x64_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
const __m128i weights = LoadUnaligned16(smooth_weights + 12);
- __m128i scale = _mm_set1_epi16(256);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1815,32 +1804,29 @@
_mm_mullo_epi16(inverted_weights1, top_right);
const __m128i scaled_top_right2 =
_mm_mullo_epi16(inverted_weights2, top_right);
- scale = _mm_set1_epi16(128);
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int left_offset = 0; left_offset < 64; left_offset += 8) {
- const __m128i left = cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
scaled_top_right1, scaled_top_right2,
- scale);
+ round);
dst += stride;
}
}
}
void aom_smooth_h_predictor_32x8_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -1857,29 +1843,27 @@
_mm_mullo_epi16(inverted_weights3, top_right);
const __m128i scaled_top_right4 =
_mm_mullo_epi16(inverted_weights4, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
}
void aom_smooth_h_predictor_32x16_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -1896,15 +1880,14 @@
_mm_mullo_epi16(inverted_weights3, top_right);
const __m128i scaled_top_right4 =
_mm_mullo_epi16(inverted_weights4, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left1, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
const __m128i left2 =
@@ -1913,22 +1896,21 @@
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left2, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
}
void aom_smooth_h_predictor_32x32_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -1945,60 +1927,57 @@
_mm_mullo_epi16(inverted_weights3, top_right);
const __m128i scaled_top_right4 =
_mm_mullo_epi16(inverted_weights4, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
__m128i left = cvtepu8_epi16(LoadLo8(left_column));
- uint8_t *dst = (uint8_t *)dest;
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
- left = cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
dst += stride;
}
}
void aom_smooth_h_predictor_32x64_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -2015,35 +1994,32 @@
_mm_mullo_epi16(inverted_weights3, top_right);
const __m128i scaled_top_right4 =
_mm_mullo_epi16(inverted_weights4, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int left_offset = 0; left_offset < 64; left_offset += 8) {
- const __m128i left = cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
scaled_top_right1, scaled_top_right2,
- scale);
+ round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
weights4, scaled_top_right3,
- scaled_top_right4, scale);
+ scaled_top_right4, round);
dst += stride;
}
}
}
void aom_smooth_h_predictor_64x16_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lolo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_lohi);
@@ -2078,48 +2054,45 @@
_mm_mullo_epi16(inverted_weights7, top_right);
const __m128i scaled_top_right8 =
_mm_mullo_epi16(inverted_weights8, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left1, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
- scaled_top_right5, scaled_top_right6, scale);
+ scaled_top_right5, scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
- scaled_top_right7, scaled_top_right8, scale);
+ scaled_top_right7, scaled_top_right8, round);
dst += stride;
}
- const __m128i left2 = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
__m128i y_select = _mm_set1_epi32(y_mask);
__m128i left_y = _mm_shuffle_epi8(left2, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
- scaled_top_right5, scaled_top_right6, scale);
+ scaled_top_right5, scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
- scaled_top_right7, scaled_top_right8, scale);
+ scaled_top_right7, scaled_top_right8, round);
dst += stride;
}
}
void aom_smooth_h_predictor_64x32_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lolo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_lohi);
@@ -2154,75 +2127,72 @@
_mm_mullo_epi16(inverted_weights7, top_right);
const __m128i scaled_top_right8 =
_mm_mullo_epi16(inverted_weights8, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
- scaled_top_right5, scaled_top_right6, scale);
+ scaled_top_right5, scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
- scaled_top_right7, scaled_top_right8, scale);
+ scaled_top_right7, scaled_top_right8, round);
dst += stride;
}
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
- const __m128i left2 = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
- scaled_top_right5, scaled_top_right6, scale);
+ scaled_top_right5, scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
- scaled_top_right7, scaled_top_right8, scale);
+ scaled_top_right7, scaled_top_right8, round);
dst += stride;
}
- const __m128i left3 = cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
- scaled_top_right5, scaled_top_right6, scale);
+ scaled_top_right5, scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
- scaled_top_right7, scaled_top_right8, scale);
+ scaled_top_right7, scaled_top_right8, round);
dst += stride;
}
- const __m128i left4 = cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
- scaled_top_right1, scaled_top_right2, scale);
+ scaled_top_right1, scaled_top_right2, round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
- scaled_top_right3, scaled_top_right4, scale);
+ scaled_top_right3, scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
- scaled_top_right5, scaled_top_right6, scale);
+ scaled_top_right5, scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
- scaled_top_right7, scaled_top_right8, scale);
+ scaled_top_right7, scaled_top_right8, round);
dst += stride;
}
}
void aom_smooth_h_predictor_64x64_ssse3(
- uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
const uint8_t *LIBAOM_RESTRICT top_row,
const uint8_t *LIBAOM_RESTRICT left_column) {
- const uint8_t *const top = (const uint8_t *)top_row;
- const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
- __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
const __m128i weights1 = cvtepu8_epi16(weights_lolo);
const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
const __m128i weights3 = cvtepu8_epi16(weights_lohi);
@@ -2257,26 +2227,24 @@
_mm_mullo_epi16(inverted_weights7, top_right);
const __m128i scaled_top_right8 =
_mm_mullo_epi16(inverted_weights8, top_right);
- scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
- const uint8_t *const left_ptr = (const uint8_t *)left_column;
- uint8_t *dst = (uint8_t *)dest;
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
for (int left_offset = 0; left_offset < 64; left_offset += 8) {
- const __m128i left = cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
const __m128i y_select = _mm_set1_epi32(y_mask);
const __m128i left_y = _mm_shuffle_epi8(left, y_select);
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
scaled_top_right1, scaled_top_right2,
- scale);
+ round);
write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
weights4, scaled_top_right3,
- scaled_top_right4, scale);
+ scaled_top_right4, round);
write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
weights6, scaled_top_right5,
- scaled_top_right6, scale);
+ scaled_top_right6, round);
write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
weights8, scaled_top_right7,
- scaled_top_right8, scale);
+ scaled_top_right8, round);
dst += stride;
}
}