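[x86]: aom_smooth_h_predictor_{16,32,64} cosmetics.

Rename the 'dest' parameter to 'dst' and drop the redundant local
pointer copies and casts ('top', 'left_ptr', 'dst'). Make 'scale' const
and add a separate const 'round' instead of reassigning 'scale'.
Replace the literal 256/128 constants with expressions based on
SMOOTH_WEIGHT_LOG2_SCALE. No functional change intended.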

Change-Id: I19b3e287ca16c85e9bffaef2ab941905214f78a7
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 9af5e78..5482ef5 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -1642,14 +1642,13 @@
 }
 
 void aom_smooth_h_predictor_16x4_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   const __m128i left = cvtepu8_epi16(Load4(left_column));
   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1658,38 +1657,36 @@
       _mm_mullo_epi16(inverted_weights1, top_right);
   const __m128i scaled_top_right2 =
       _mm_mullo_epi16(inverted_weights2, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   __m128i y_mask = _mm_set1_epi32(0x01000100);
   __m128i left_y = _mm_shuffle_epi8(left, y_mask);
-  uint8_t *dst = (uint8_t *)dest;
   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                 scaled_top_right1, scaled_top_right2, scale);
+                                 scaled_top_right1, scaled_top_right2, round);
   dst += stride;
   y_mask = _mm_set1_epi32(0x03020302);
   left_y = _mm_shuffle_epi8(left, y_mask);
   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                 scaled_top_right1, scaled_top_right2, scale);
+                                 scaled_top_right1, scaled_top_right2, round);
   dst += stride;
   y_mask = _mm_set1_epi32(0x05040504);
   left_y = _mm_shuffle_epi8(left, y_mask);
   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                 scaled_top_right1, scaled_top_right2, scale);
+                                 scaled_top_right1, scaled_top_right2, round);
   dst += stride;
   y_mask = _mm_set1_epi32(0x07060706);
   left_y = _mm_shuffle_epi8(left, y_mask);
   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                 scaled_top_right1, scaled_top_right2, scale);
+                                 scaled_top_right1, scaled_top_right2, round);
 }
 
 void aom_smooth_h_predictor_16x8_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
-  __m128i scale = _mm_set1_epi16(256);
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1698,25 +1695,23 @@
       _mm_mullo_epi16(inverted_weights1, top_right);
   const __m128i scaled_top_right2 =
       _mm_mullo_epi16(inverted_weights2, top_right);
-  scale = _mm_set1_epi16(128);
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_16x16_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
-  __m128i scale = _mm_set1_epi16(256);
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1725,35 +1720,32 @@
       _mm_mullo_epi16(inverted_weights1, top_right);
   const __m128i scaled_top_right2 =
       _mm_mullo_epi16(inverted_weights2, top_right);
-  scale = _mm_set1_epi16(128);
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
-  uint8_t *dst = (uint8_t *)dest;
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_16x32_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
-  __m128i scale = _mm_set1_epi16(256);
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1762,51 +1754,48 @@
       _mm_mullo_epi16(inverted_weights1, top_right);
   const __m128i scaled_top_right2 =
       _mm_mullo_epi16(inverted_weights2, top_right);
-  scale = _mm_set1_epi16(128);
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
-  uint8_t *dst = dest;
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  left = cvtepu8_epi16(LoadLo8(left_column + 16));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  left = cvtepu8_epi16(LoadLo8(left_column + 24));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_16x64_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
-  __m128i scale = _mm_set1_epi16(256);
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
@@ -1815,32 +1804,29 @@
       _mm_mullo_epi16(inverted_weights1, top_right);
   const __m128i scaled_top_right2 =
       _mm_mullo_epi16(inverted_weights2, top_right);
-  scale = _mm_set1_epi16(128);
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
-    const __m128i left = cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
       const __m128i y_select = _mm_set1_epi32(y_mask);
       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                      scaled_top_right1, scaled_top_right2,
-                                     scale);
+                                     round);
       dst += stride;
     }
   }
 }
 
 void aom_smooth_h_predictor_32x8_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -1857,29 +1843,27 @@
       _mm_mullo_epi16(inverted_weights3, top_right);
   const __m128i scaled_top_right4 =
       _mm_mullo_epi16(inverted_weights4, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_32x16_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -1896,15 +1880,14 @@
       _mm_mullo_epi16(inverted_weights3, top_right);
   const __m128i scaled_top_right4 =
       _mm_mullo_epi16(inverted_weights4, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
   const __m128i left2 =
@@ -1913,22 +1896,21 @@
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_32x32_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -1945,60 +1927,57 @@
       _mm_mullo_epi16(inverted_weights3, top_right);
   const __m128i scaled_top_right4 =
       _mm_mullo_epi16(inverted_weights4, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
-  uint8_t *dst = (uint8_t *)dest;
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  left = cvtepu8_epi16(LoadLo8(left_column + 16));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
-  left = cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  left = cvtepu8_epi16(LoadLo8(left_column + 24));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_32x64_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_hi);
@@ -2015,35 +1994,32 @@
       _mm_mullo_epi16(inverted_weights3, top_right);
   const __m128i scaled_top_right4 =
       _mm_mullo_epi16(inverted_weights4, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
-    const __m128i left = cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
       const __m128i y_select = _mm_set1_epi32(y_mask);
       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                      scaled_top_right1, scaled_top_right2,
-                                     scale);
+                                     round);
       write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
                                      weights4, scaled_top_right3,
-                                     scaled_top_right4, scale);
+                                     scaled_top_right4, round);
       dst += stride;
     }
   }
 }
 
 void aom_smooth_h_predictor_64x16_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i top_right = _mm_set1_epi16(top_row[63]);
   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
@@ -2078,48 +2054,45 @@
       _mm_mullo_epi16(inverted_weights7, top_right);
   const __m128i scaled_top_right8 =
       _mm_mullo_epi16(inverted_weights8, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
-                                   scaled_top_right5, scaled_top_right6, scale);
+                                   scaled_top_right5, scaled_top_right6, round);
     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
-                                   scaled_top_right7, scaled_top_right8, scale);
+                                   scaled_top_right7, scaled_top_right8, round);
     dst += stride;
   }
-  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     __m128i y_select = _mm_set1_epi32(y_mask);
     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
-                                   scaled_top_right5, scaled_top_right6, scale);
+                                   scaled_top_right5, scaled_top_right6, round);
     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
-                                   scaled_top_right7, scaled_top_right8, scale);
+                                   scaled_top_right7, scaled_top_right8, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_64x32_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i top_right = _mm_set1_epi16(top_row[63]);
   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
@@ -2154,75 +2127,72 @@
       _mm_mullo_epi16(inverted_weights7, top_right);
   const __m128i scaled_top_right8 =
       _mm_mullo_epi16(inverted_weights8, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
-                                   scaled_top_right5, scaled_top_right6, scale);
+                                   scaled_top_right5, scaled_top_right6, round);
     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
-                                   scaled_top_right7, scaled_top_right8, scale);
+                                   scaled_top_right7, scaled_top_right8, round);
     dst += stride;
   }
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
-  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
-                                   scaled_top_right5, scaled_top_right6, scale);
+                                   scaled_top_right5, scaled_top_right6, round);
     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
-                                   scaled_top_right7, scaled_top_right8, scale);
+                                   scaled_top_right7, scaled_top_right8, round);
     dst += stride;
   }
-  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
-                                   scaled_top_right5, scaled_top_right6, scale);
+                                   scaled_top_right5, scaled_top_right6, round);
     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
-                                   scaled_top_right7, scaled_top_right8, scale);
+                                   scaled_top_right7, scaled_top_right8, round);
     dst += stride;
   }
-  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
     const __m128i y_select = _mm_set1_epi32(y_mask);
     const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
-                                   scaled_top_right1, scaled_top_right2, scale);
+                                   scaled_top_right1, scaled_top_right2, round);
     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
-                                   scaled_top_right3, scaled_top_right4, scale);
+                                   scaled_top_right3, scaled_top_right4, round);
     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
-                                   scaled_top_right5, scaled_top_right6, scale);
+                                   scaled_top_right5, scaled_top_right6, round);
     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
-                                   scaled_top_right7, scaled_top_right8, scale);
+                                   scaled_top_right7, scaled_top_right8, round);
     dst += stride;
   }
 }
 
 void aom_smooth_h_predictor_64x64_ssse3(
-    uint8_t *LIBAOM_RESTRICT dest, ptrdiff_t stride,
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
     const uint8_t *LIBAOM_RESTRICT top_row,
     const uint8_t *LIBAOM_RESTRICT left_column) {
-  const uint8_t *const top = (const uint8_t *)top_row;
-  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i top_right = _mm_set1_epi16(top_row[63]);
   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
-  __m128i scale = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
@@ -2257,26 +2227,24 @@
       _mm_mullo_epi16(inverted_weights7, top_right);
   const __m128i scaled_top_right8 =
       _mm_mullo_epi16(inverted_weights8, top_right);
-  scale = _mm_set1_epi16((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  const uint8_t *const left_ptr = (const uint8_t *)left_column;
-  uint8_t *dst = (uint8_t *)dest;
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
-    const __m128i left = cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
       const __m128i y_select = _mm_set1_epi32(y_mask);
       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
                                      scaled_top_right1, scaled_top_right2,
-                                     scale);
+                                     round);
       write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
                                      weights4, scaled_top_right3,
-                                     scaled_top_right4, scale);
+                                     scaled_top_right4, round);
       write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
                                      weights6, scaled_top_right5,
-                                     scaled_top_right6, scale);
+                                     scaled_top_right6, round);
       write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
                                      weights8, scaled_top_right7,
-                                     scaled_top_right8, scale);
+                                     scaled_top_right8, round);
       dst += stride;
     }
   }