x86: normalize types used with _mm{,256}_set?_epi8

w/clang -fsanitize=integer fixes warnings of the form:
implicit conversion from type 'uint8_t' (aka 'unsigned char') of value
128 (8-bit, unsigned) to type 'char' changed the value to -128 (8-bit,
signed)

Bug: aomedia:3136
Bug: b/229626362
Change-Id: I1d42bd4971a58c081d1a9b5a7f2d78a3281026b3
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index 3cc6c02..dbd2442 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -81,7 +81,7 @@
 
 SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
 
-SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
 
 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
 
@@ -534,7 +534,7 @@
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                        _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
@@ -593,9 +593,9 @@
 #define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
 #define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
 #define v128_shl_n_8(a, c) \
-  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
 #define v128_shr_n_u8(a, c) \
-  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
 #define v128_shr_n_s8(a, c)                                         \
   _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
                   _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index e10846b..10f4f8a 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -78,7 +78,7 @@
 
 SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
 
-SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); }
 
 SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
 
@@ -596,7 +596,7 @@
 }
 
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
-  return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
+  return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)),
                           _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
@@ -677,11 +677,12 @@
 #define v256_align(a, b, c) \
   ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
 
-#define v256_shl_n_8(a, c)                                   \
-  _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
+#define v256_shl_n_8(a, c)                                \
+  _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \
                    _mm256_slli_epi16(a, c))
-#define v256_shr_n_u8(a, c) \
-  _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
+#define v256_shr_n_u8(a, c)                               \
+  _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \
+                   _mm256_srli_epi16(a, c))
 #define v256_shr_n_s8(a, c)                                                  \
   _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
                      _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 42b602b..95a8765 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -101,7 +101,7 @@
 
 SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
 
-SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
 
 SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
 
@@ -433,7 +433,7 @@
 SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                        _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
 }
 
@@ -477,9 +477,9 @@
 #define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
 #define v64_shl_n_8(a, c) \
-  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
 #define v64_shr_n_u8(a, c) \
-  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
 #define v64_shr_n_s8(a, c) \
   _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
 #define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index b5f7144..597e3bd 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -361,7 +361,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
   row_store_32xh(&row, 32, dst, stride);
 }
 
@@ -628,7 +628,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
   row_store_32xh(&row, 16, dst, stride);
 }
 
@@ -637,7 +637,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
   row_store_32xh(&row, 64, dst, stride);
 }
 
@@ -646,7 +646,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
   row_store_64xh(&row, 64, dst, stride);
 }
 
@@ -655,7 +655,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
   row_store_64xh(&row, 32, dst, stride);
 }
 
@@ -664,7 +664,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
   row_store_64xh(&row, 16, dst, stride);
 }
 
@@ -3537,7 +3537,7 @@
   __m128i a_mbase_x;
 
   a16 = _mm256_set1_epi16(16);
-  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
   c3f = _mm256_set1_epi16(0x3f);
 
   int x = dx;
@@ -3640,7 +3640,7 @@
   __m256i a_mbase_x, diff, c3f;
 
   a16 = _mm256_set1_epi16(16);
-  a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
   c3f = _mm256_set1_epi16(0x3f);
 
   int x = dx;
@@ -3722,7 +3722,7 @@
   __m128i max_base_x128, base_inc128, mask128;
 
   a16 = _mm256_set1_epi16(16);
-  a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
   max_base_x128 = _mm_set1_epi8(max_base_x);
   c3f = _mm256_set1_epi16(0x3f);
 
@@ -3766,14 +3766,14 @@
                      _mm256_extracti128_si256(res, 1)));  // 16 8bit values
 
         base_inc128 =
-            _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1),
-                          (uint8_t)(base + j + 2), (uint8_t)(base + j + 3),
-                          (uint8_t)(base + j + 4), (uint8_t)(base + j + 5),
-                          (uint8_t)(base + j + 6), (uint8_t)(base + j + 7),
-                          (uint8_t)(base + j + 8), (uint8_t)(base + j + 9),
-                          (uint8_t)(base + j + 10), (uint8_t)(base + j + 11),
-                          (uint8_t)(base + j + 12), (uint8_t)(base + j + 13),
-                          (uint8_t)(base + j + 14), (uint8_t)(base + j + 15));
+            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
 
         mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
                                  _mm_setzero_si128());
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index d42f185..ccbce1e 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -146,7 +146,7 @@
   sum += 6;
   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  const __m128i row = _mm_set1_epi8((int8_t)sum);
   dc_store_8xh(&row, 4, dst, stride);
 }
 
@@ -743,7 +743,7 @@
                                    const uint8_t *above, const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_8xh(&row, 4, dst, stride);
 }
 
@@ -751,7 +751,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_8xh(&row, 16, dst, stride);
 }
 
@@ -759,7 +759,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_8xh(&row, 32, dst, stride);
 }
 
@@ -767,7 +767,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_16xh(&row, 4, dst, stride);
 }
 
@@ -775,7 +775,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_16xh(&row, 8, dst, stride);
 }
 
@@ -784,7 +784,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_16xh(&row, 32, dst, stride);
 }
 
@@ -793,7 +793,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_16xh(&row, 64, dst, stride);
 }
 
@@ -801,7 +801,7 @@
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_32xh(&row, 8, dst, stride);
 }
 
@@ -810,7 +810,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_32xh(&row, 16, dst, stride);
 }
 
@@ -819,7 +819,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_32xh(&row, 64, dst, stride);
 }
 
@@ -828,7 +828,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_64xh(&row, 64, dst, stride);
 }
 
@@ -837,7 +837,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_64xh(&row, 32, dst, stride);
 }
 
@@ -846,7 +846,7 @@
                                      const uint8_t *left) {
   (void)above;
   (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  const __m128i row = _mm_set1_epi8((int8_t)128);
   dc_store_64xh(&row, 16, dst, stride);
 }
 
diff --git a/aom_dsp/x86/intrapred_sse4.c b/aom_dsp/x86/intrapred_sse4.c
index 21fb1bb..f2c6188 100644
--- a/aom_dsp/x86/intrapred_sse4.c
+++ b/aom_dsp/x86/intrapred_sse4.c
@@ -141,7 +141,7 @@
   __m128i a_mbase_x;
 
   a16 = _mm_set1_epi16(16);
-  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
   c3f = _mm_set1_epi16(0x3f);
 
   int x = dx;
@@ -255,7 +255,7 @@
   __m128i a_mbase_x, diff, c3f;
 
   a16 = _mm_set1_epi16(16);
-  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
   c3f = _mm_set1_epi16(0x3f);
 
   int x = dx;
@@ -353,7 +353,7 @@
   __m128i max_base, base_inc, mask;
 
   a16 = _mm_set1_epi16(16);
-  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
   max_base = _mm_set1_epi8(max_base_x);
   c3f = _mm_set1_epi16(0x3f);
 
@@ -412,14 +412,14 @@
         res = _mm_packus_epi16(res, res1);  // 16 8bit values
 
         base_inc =
-            _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1),
-                          (uint8_t)(base + j + 2), (uint8_t)(base + j + 3),
-                          (uint8_t)(base + j + 4), (uint8_t)(base + j + 5),
-                          (uint8_t)(base + j + 6), (uint8_t)(base + j + 7),
-                          (uint8_t)(base + j + 8), (uint8_t)(base + j + 9),
-                          (uint8_t)(base + j + 10), (uint8_t)(base + j + 11),
-                          (uint8_t)(base + j + 12), (uint8_t)(base + j + 13),
-                          (uint8_t)(base + j + 14), (uint8_t)(base + j + 15));
+            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
 
         mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
                               _mm_setzero_si128());
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index 6ec5dd8..ee88e1e 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -49,8 +49,8 @@
                                       int ref_stride,
                                       const DIST_WTD_COMP_PARAMS *jcp_param) {
   int i;
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+  const int8_t w1 = (int8_t)jcp_param->bck_offset;
   const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
                                  w1, w0, w1, w0);
   const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
@@ -95,10 +95,10 @@
     assert(!(width & 3));
     assert(!(height & 3));
     for (i = 0; i < height; i += 4) {
-      const uint8_t *row0 = ref + 0 * ref_stride;
-      const uint8_t *row1 = ref + 1 * ref_stride;
-      const uint8_t *row2 = ref + 2 * ref_stride;
-      const uint8_t *row3 = ref + 3 * ref_stride;
+      const int8_t *row0 = (const int8_t *)ref + 0 * ref_stride;
+      const int8_t *row1 = (const int8_t *)ref + 1 * ref_stride;
+      const int8_t *row2 = (const int8_t *)ref + 2 * ref_stride;
+      const int8_t *row3 = (const int8_t *)ref + 3 * ref_stride;
 
       __m128i p0 =
           _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
diff --git a/aom_dsp/x86/variance_impl_ssse3.c b/aom_dsp/x86/variance_impl_ssse3.c
index 66b0d7d..6990021 100644
--- a/aom_dsp/x86/variance_impl_ssse3.c
+++ b/aom_dsp/x86/variance_impl_ssse3.c
@@ -25,8 +25,8 @@
   // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
   const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
   const __m128i r = _mm_set1_epi16(round);
-  const uint8_t f0 = filter[0] >> 1;
-  const uint8_t f1 = filter[1] >> 1;
+  const int8_t f0 = (int8_t)(filter[0] >> 1);
+  const int8_t f1 = (int8_t)(filter[1] >> 1);
   const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
                                         f0, f1, f0, f1, f0, f1);
   unsigned int i, j;
diff --git a/av1/common/x86/intra_edge_sse4.c b/av1/common/x86/intra_edge_sse4.c
index fc69f41..f025f79 100644
--- a/av1/common/x86/intra_edge_sse4.c
+++ b/av1/common/x86/intra_edge_sse4.c
@@ -33,7 +33,7 @@
 
   // Extend the first and last samples to simplify the loop for the 5-tap case
   p[-1] = p[0];
-  __m128i last = _mm_set1_epi8(p[sz - 1]);
+  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
   _mm_storeu_si128((__m128i *)&p[sz], last);
 
   // Adjust input pointer for filter support area
diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c
index 7ac0f0d..e3be996 100644
--- a/av1/encoder/x86/reconinter_enc_ssse3.c
+++ b/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -48,8 +48,8 @@
   assert(!(width * height & 15));
   n = width * height >> 4;
 
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+  const int8_t w1 = (int8_t)jcp_param->bck_offset;
   const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
                                  w1, w0, w1, w0);
   const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);