[CFL] Store reconstructed pixels as uint16 instead of int16

To avoid confusion between the reconstructed buffer and the AC buffer,
we change the reconstructed buffer to unsigned. This change does not
alter the encoder behavior.

Results on Subset1:
  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0000 |  0.0000 |  0.0000 |   0.0000 | 0.0000 |  0.0000 |     0.0000

https://arewecompressedyet.com/?job=unsigned_recpn%402018-05-14T13%3A09%3A38.233Z&job=2buf_cfl%402018-05-14T13%3A10%3A23.765Z

Change-Id: I6943482c3c501a4ef50fe7020c7b16b252c368f1
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 2b407fc..d731b6a 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -14,9 +14,10 @@
 
 #include "av1/common/cfl.h"
 
-static INLINE void vldsubstq_s16(int16_t *dst, const int16_t *src, int offset,
+static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
                                  int16x8_t sub) {
-  vst1q_s16(dst + offset, vsubq_s16(vld1q_s16(src + offset), sub));
+  vst1q_s16(dst + offset,
+            vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub));
 }
 
 static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
@@ -29,8 +30,8 @@
 }
 
 // Store half of a vector.
-static INLINE void vsth_s16(int16_t *ptr, int16x4_t val) {
-  *((uint32_t *)ptr) = vreinterpret_u32_s16(val)[0];
+static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
+  *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0];
 }
 
 // Store half of a vector.
@@ -40,23 +41,23 @@
 
 static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
-  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
   const int luma_stride = input_stride << 1;
   do {
     if (width == 4) {
       const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
       const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
-      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
+      vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1));
     } else if (width == 8) {
       const uint16x4_t top = vpaddl_u8(vld1_u8(input));
       const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
-      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
+      vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1));
     } else if (width == 16) {
       const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
       const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
-      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(sum), 1));
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1));
     } else {
       const uint8x8x4_t top = vld4_u8(input);
       const uint8x8x4_t bot = vld4_u8(input + input_stride);
@@ -68,12 +69,10 @@
       const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]);
       // equivalent to a vpaddlq_u8 (because vld4q interleaves)
       const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]);
-      int16x8x2_t sum;
-      sum.val[0] =
-          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_0, bot_0)), 1);
-      sum.val[1] =
-          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_1, bot_1)), 1);
-      vst2q_s16(pred_buf_q3, sum);
+      uint16x8x2_t sum;
+      sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+      sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+      vst2q_u16(pred_buf_q3, sum);
     }
     input += luma_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -81,29 +80,26 @@
 
 static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
-  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
   do {
     if (width == 4) {
       const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
-      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
+      vsth_u16(pred_buf_q3, vshl_n_u16(top, 2));
     } else if (width == 8) {
       const uint16x4_t top = vpaddl_u8(vld1_u8(input));
-      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
+      vst1_u16(pred_buf_q3, vshl_n_u16(top, 2));
     } else if (width == 16) {
       const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
-      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(top), 2));
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2));
     } else {
       const uint8x8x4_t top = vld4_u8(input);
-      int16x8x2_t sum;
-      sum.val[0] = vshlq_n_s16(
-          // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves)
-          vreinterpretq_s16_u16(vaddl_u8(top.val[0], top.val[1])), 2);
-      sum.val[1] = vshlq_n_s16(
-          // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves)
-          vreinterpretq_s16_u16(vaddl_u8(top.val[2], top.val[3])), 2);
-      vst2q_s16(pred_buf_q3, sum);
+      uint16x8x2_t sum;
+      // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves)
+      sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2);
+      sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2);
+      vst2q_u16(pred_buf_q3, sum);
     }
     input += input_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -111,28 +107,24 @@
 
 static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
-  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
   do {
     if (width == 4) {
       const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3);
-      vst1_s16(pred_buf_q3, vreinterpret_s16_u16(vget_low_u16(top)));
+      vst1_u16(pred_buf_q3, vget_low_u16(top));
     } else if (width == 8) {
       const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3);
-      vst1q_s16(pred_buf_q3, vreinterpretq_s16_u16(top));
+      vst1q_u16(pred_buf_q3, top);
     } else {
       const uint8x16_t top = vld1q_u8(input);
-      vst1q_s16(pred_buf_q3,
-                vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(top), 3)));
-      vst1q_s16(pred_buf_q3 + 8,
-                vreinterpretq_s16_u16(vshll_n_u8(vget_high_u8(top), 3)));
+      vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3));
+      vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3));
       if (width == 32) {
         const uint8x16_t next_top = vld1q_u8(input + 16);
-        vst1q_s16(pred_buf_q3 + 16,
-                  vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(next_top), 3)));
-        vst1q_s16(pred_buf_q3 + 24,
-                  vreinterpretq_s16_u16(vshll_n_u8(vget_high_u8(next_top), 3)));
+        vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3));
+        vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3));
       }
     }
     input += input_stride;
@@ -148,9 +140,9 @@
 
 static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
-  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
   const int luma_stride = input_stride << 1;
   do {
     if (width == 4) {
@@ -158,21 +150,20 @@
       const uint16x4_t bot = vld1_u16(input + input_stride);
       const uint16x4_t sum = vadd_u16(top, bot);
       const uint16x4_t hsum = vpadd_u16(sum, sum);
-      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(hsum), 1));
+      vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
     } else if (width < 32) {
       const uint16x8_t top = vld1q_u16(input);
       const uint16x8_t bot = vld1q_u16(input + input_stride);
       const uint16x8_t sum = vaddq_u16(top, bot);
       if (width == 8) {
-        const int16x4_t hsum =
-            vreinterpret_s16_u16(vget_low_u16(vpaddq_u16(sum, sum)));
-        vst1_s16(pred_buf_q3, vshl_n_s16(hsum, 1));
+        const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum));
+        vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
       } else {
         const uint16x8_t top_1 = vld1q_u16(input + 8);
         const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride);
         const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1);
-        const int16x8_t hsum = vreinterpretq_s16_u16(vpaddq_u16(sum, sum_1));
-        vst1q_s16(pred_buf_q3, vshlq_n_s16(hsum, 1));
+        const uint16x8_t hsum = vpaddq_u16(sum, sum_1);
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1));
       }
     } else {
       const uint16x8x4_t top = vld4q_u16(input);
@@ -185,12 +176,10 @@
       const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]);
       // equivalent to a vpaddq_u16 (because vld4q interleaves)
       const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]);
-      int16x8x2_t sum;
-      sum.val[0] =
-          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_0, bot_0)), 1);
-      sum.val[1] =
-          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_1, bot_1)), 1);
-      vst2q_s16(pred_buf_q3, sum);
+      uint16x8x2_t sum;
+      sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+      sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+      vst2q_u16(pred_buf_q3, sum);
     }
     input += luma_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -198,34 +187,33 @@
 
 static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
-  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
   do {
     if (width == 4) {
       const uint16x4_t top = vld1_u16(input);
       const uint16x4_t hsum = vpadd_u16(top, top);
-      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(hsum), 2));
+      vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
     } else if (width == 8) {
       const uint16x4x2_t top = vld2_u16(input);
       // equivalent to a vpadd_u16 (because vld2 interleaves)
       const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]);
-      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(hsum), 2));
+      vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
     } else if (width == 16) {
       const uint16x8x2_t top = vld2q_u16(input);
       // equivalent to a vpaddq_u16 (because vld2q interleaves)
       const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]);
-      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(hsum), 2));
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2));
     } else {
       const uint16x8x4_t top = vld4q_u16(input);
       // equivalent to a vpaddq_u16 (because vld4q interleaves)
       const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]);
       // equivalent to a vpaddq_u16 (because vld4q interleaves)
       const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]);
-      int16x8x2_t result = { { vshlq_n_s16(vreinterpretq_s16_u16(hsum_0), 2),
-                               vshlq_n_s16(vreinterpretq_s16_u16(hsum_1),
-                                           2) } };
-      vst2q_s16(pred_buf_q3, result);
+      uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2),
+                                vshlq_n_u16(hsum_1, 2) } };
+      vst2q_u16(pred_buf_q3, result);
     }
     input += input_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -233,30 +221,28 @@
 
 static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
-  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
   do {
     if (width == 4) {
-      const int16x4_t top = vreinterpret_s16_u16(vld1_u16(input));
-      vst1_s16(pred_buf_q3, vshl_n_s16(top, 3));
+      const uint16x4_t top = vld1_u16(input);
+      vst1_u16(pred_buf_q3, vshl_n_u16(top, 3));
     } else if (width == 8) {
-      const int16x8_t top = vreinterpretq_s16_u16(vld1q_u16(input));
-      vst1q_s16(pred_buf_q3, vshlq_n_s16(top, 3));
+      const uint16x8_t top = vld1q_u16(input);
+      vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3));
     } else if (width == 16) {
-      const uint16x8x2_t top = vld2q_u16(input);
-      int16x8x2_t results;
-      results.val[0] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[0]), 3);
-      results.val[1] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[1]), 3);
-      vst2q_s16(pred_buf_q3, results);
+      uint16x8x2_t top = vld2q_u16(input);
+      top.val[0] = vshlq_n_u16(top.val[0], 3);
+      top.val[1] = vshlq_n_u16(top.val[1], 3);
+      vst2q_u16(pred_buf_q3, top);
     } else {
-      const uint16x8x4_t top = vld4q_u16(input);
-      int16x8x4_t results;
-      results.val[0] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[0]), 3);
-      results.val[1] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[1]), 3);
-      results.val[2] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[2]), 3);
-      results.val[3] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[3]), 3);
-      vst4q_s16(pred_buf_q3, results);
+      uint16x8x4_t top = vld4q_u16(input);
+      top.val[0] = vshlq_n_u16(top.val[0], 3);
+      top.val[1] = vshlq_n_u16(top.val[1], 3);
+      top.val[2] = vshlq_n_u16(top.val[2], 3);
+      top.val[3] = vshlq_n_u16(top.val[3], 3);
+      vst4q_u16(pred_buf_q3, top);
     }
     input += input_stride;
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
@@ -264,12 +250,11 @@
 
 CFL_GET_SUBSAMPLE_FUNCTION(neon)
 
-static INLINE void subtract_average_neon(const int16_t *src, int16_t *dst,
+static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
                                          int width, int height,
                                          int round_offset,
                                          const int num_pel_log2) {
-  const int16_t *const end = src + height * CFL_BUF_LINE;
-  const uint16_t *const sum_end = (uint16_t *)end;
+  const uint16_t *const end = src + height * CFL_BUF_LINE;
 
   // Round offset is not needed, because NEON will handle the rounding.
   (void)round_offset;
@@ -281,7 +266,7 @@
   // pixels, which are positive integer and only require 15 bits. By using
   // unsigned integer for the sum, we can do one addition operation inside 16
   // bits (8 lanes) before having to convert to 32 bits (4 lanes).
-  const uint16_t *sum_buf = (uint16_t *)src;
+  const uint16_t *sum_buf = src;
   uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
   do {
     // For all widths, we load, add and combine the data so it fits in 4 lanes.
@@ -322,7 +307,7 @@
       }
     }
     sum_buf += step;
-  } while (sum_buf < sum_end);
+  } while (sum_buf < end);
 
   // Permute and add in such a way that each lane contains the block sum.
   // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
@@ -355,7 +340,7 @@
 
   if (width == 4) {
     do {
-      vst1_s16(dst, vsub_s16(vld1_s16(src), avg_16x4));
+      vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4));
       src += CFL_BUF_LINE;
       dst += CFL_BUF_LINE;
     } while (src < end);
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 91aac2d..df5886d 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -36,12 +36,12 @@
 
 /* Function pointers return by CfL functions */
 typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
-                                     int16_t *output_q3);
+                                     uint16_t *output_q3);
 
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
-                                     int16_t *output_q3);
+                                     uint16_t *output_q3);
 
-typedef void (*cfl_subtract_average_fn)(const int16_t *src, int16_t *dst);
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
 typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
                                    int dst_stride, int alpha_q3);
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 17f90ba..4edcfff 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -451,7 +451,7 @@
 typedef struct cfl_ctx {
   // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid
   // shifts)
-  int16_t recon_buf_q3[CFL_BUF_SQUARE];
+  uint16_t recon_buf_q3[CFL_BUF_SQUARE];
   // Q3 AC contributions (reconstructed luma pixels - tx block avg)
   int16_t ac_buf_q3[CFL_BUF_SQUARE];
 
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index bef235c..ee19f0b 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -95,9 +95,9 @@
 
   if (diff_width > 0) {
     const int min_height = height - diff_height;
-    int16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);
+    uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);
     for (int j = 0; j < min_height; j++) {
-      const int16_t last_pixel = recon_buf_q3[-1];
+      const uint16_t last_pixel = recon_buf_q3[-1];
       assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
       for (int i = 0; i < diff_width; i++) {
         recon_buf_q3[i] = last_pixel;
@@ -107,10 +107,10 @@
     cfl->buf_width = width;
   }
   if (diff_height > 0) {
-    int16_t *recon_buf_q3 =
+    uint16_t *recon_buf_q3 =
         cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);
     for (int j = 0; j < diff_height; j++) {
-      const int16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;
+      const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;
       assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
       for (int i = 0; i < width; i++) {
         recon_buf_q3[i] = last_row_q3[i];
@@ -121,10 +121,10 @@
   }
 }
 
-static void subtract_average_c(const int16_t *src, int16_t *dst, int width,
+static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,
                                int height, int round_offset, int num_pel_log2) {
   int sum = round_offset;
-  const int16_t *recon = src;
+  const uint16_t *recon = src;
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       sum += recon[i];
@@ -235,7 +235,7 @@
 
 // Null function used for invalid tx_sizes
 void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
-                            int16_t *output_q3) {
+                            uint16_t *output_q3) {
   (void)input;
   (void)input_stride;
   (void)output_q3;
@@ -244,7 +244,7 @@
 
 // Null function used for invalid tx_sizes
 void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
-                            int16_t *output_q3) {
+                            uint16_t *output_q3) {
   (void)input;
   (void)input_stride;
   (void)output_q3;
@@ -252,8 +252,9 @@
 }
 
 static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
-                                           int input_stride, int16_t *output_q3,
-                                           int width, int height) {
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
   for (int j = 0; j < height; j += 2) {
     for (int i = 0; i < width; i += 2) {
       const int bot = i + input_stride;
@@ -266,8 +267,9 @@
 }
 
 static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input,
-                                           int input_stride, int16_t *output_q3,
-                                           int width, int height) {
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
   assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i += 2) {
@@ -279,8 +281,9 @@
 }
 
 static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,
-                                           int input_stride, int16_t *output_q3,
-                                           int width, int height) {
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
   assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
@@ -292,8 +295,9 @@
 }
 
 static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
-                                           int input_stride, int16_t *output_q3,
-                                           int width, int height) {
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
   for (int j = 0; j < height; j += 2) {
     for (int i = 0; i < width; i += 2) {
       const int bot = i + input_stride;
@@ -306,8 +310,9 @@
 }
 
 static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input,
-                                           int input_stride, int16_t *output_q3,
-                                           int width, int height) {
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
   assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i += 2) {
@@ -319,8 +324,9 @@
 }
 
 static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input,
-                                           int input_stride, int16_t *output_q3,
-                                           int width, int height) {
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
   assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
@@ -386,7 +392,7 @@
   assert(store_col + store_width <= CFL_BUF_LINE);
 
   // Store the input into the CfL pixel buffer
-  int16_t *recon_buf_q3 =
+  uint16_t *recon_buf_q3 =
       cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
 
   if (use_hbd) {
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 0f3a763..bc9fbce 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -82,11 +82,11 @@
 
 // Null function used for invalid tx_sizes
 void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
-                            int16_t *output_q3);
+                            uint16_t *output_q3);
 
 // Null function used for invalid tx_sizes
 void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
-                            int16_t *output_q3);
+                            uint16_t *output_q3);
 
 // Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
 #define CFL_lbd_TYPE uint8_t *cfl_type
@@ -98,7 +98,7 @@
 // goodness.
 #define CFL_SUBSAMPLE(arch, sub, bd, width, height)                       \
   void subsample_##bd##_##sub##_##width##x##height##_##arch(              \
-      const CFL_##bd##_TYPE, int input_stride, int16_t *output_q3) {      \
+      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
     cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
                                                output_q3, width, height); \
   }
@@ -161,7 +161,8 @@
   CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
 
 // Null function used for invalid tx_sizes
-static INLINE void cfl_subtract_average_null(const int16_t *src, int16_t *dst) {
+static INLINE void cfl_subtract_average_null(const uint16_t *src,
+                                             int16_t *dst) {
   (void)dst;
   (void)src;
   assert(0);
@@ -171,11 +172,11 @@
 // will inline the size generic function in here, the advantage is that the size
 // will be constant allowing for loop unrolling and other constant propagated
 // goodness.
-#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)  \
-  void subtract_average_##width##x##height##_##arch(const int16_t *src, \
-                                                    int16_t *dst) {     \
-    subtract_average_##arch(src, dst, width, height, round_offset,      \
-                            num_pel_log2);                              \
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)   \
+  void subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+                                                    int16_t *dst) {      \
+    subtract_average_##arch(src, dst, width, height, round_offset,       \
+                            num_pel_log2);                               \
   }
 
 // Declare size-specific wrappers for all valid CfL sizes.
@@ -223,9 +224,9 @@
 
 // For VSX SIMD optimization, the C versions of width == 4 subtract are
 // faster than the VSX. As such, the VSX code calls the C versions.
-void subtract_average_4x4_c(const int16_t *src, int16_t *dst);
-void subtract_average_4x8_c(const int16_t *src, int16_t *dst);
-void subtract_average_4x16_c(const int16_t *src, int16_t *dst);
+void subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
 
 #define CFL_PREDICT_lbd(arch, width, height)                                 \
   void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3,   \
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index a315921..a8bfdcc 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -60,7 +60,7 @@
  */
 static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
   (void)width;                               // Forever 32
   const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
@@ -95,7 +95,7 @@
  */
 static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
   (void)width;                                // Forever 32
   const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
@@ -123,7 +123,7 @@
  */
 static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
   (void)width;  // Forever 32
   __m256i *row = (__m256i *)pred_buf_q3;
@@ -161,7 +161,7 @@
  */
 static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
   (void)width;  // Forever 32
   const int luma_stride = input_stride << 1;
@@ -201,7 +201,7 @@
  */
 static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
   (void)width;  // Forever 32
   __m256i *row = (__m256i *)pred_buf_q3;
@@ -223,7 +223,7 @@
 
 static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
                                               int input_stride,
-                                              int16_t *pred_buf_q3, int width,
+                                              uint16_t *pred_buf_q3, int width,
                                               int height) {
   (void)width;  // Forever 32
   __m256i *row = (__m256i *)pred_buf_q3;
@@ -395,7 +395,7 @@
                           _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
 }
 
-static INLINE void subtract_average_avx2(const int16_t *src_ptr,
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
                                          int16_t *dst_ptr, int width,
                                          int height, int round_offset,
                                          int num_pel_log2) {
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index 6401d0a..7479ac3 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -13,179 +13,179 @@
 
 // SSSE3 version is optimal for width == 4, we reuse them in AVX2
 void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
 void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 16, we reuse it in AVX2
 void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 4, we reuse them in AVX2
 void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
 void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 16, we reuse it in AVX2
 void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 4, we reuse them in AVX2
 void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
 void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 16, we reuse it in AVX2
 void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 
 void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
 void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is faster for width == 16, we reuse it in AVX2
 void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 
 void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
 void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is faster for width == 16, we reuse it in AVX2
 void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 
 void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
 void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
-                                 int16_t *output_q3);
+                                 uint16_t *output_q3);
 void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 
 // SSSE3 version is faster for width == 16, we reuse it in AVX2
 void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
-                                  int16_t *output_q3);
+                                  uint16_t *output_q3);
 void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
-                                   int16_t *output_q3);
+                                   uint16_t *output_q3);
 
 // SSE2 version is optimal for width == 4, we reuse them in AVX2
-void subtract_average_4x4_sse2(const int16_t *src, int16_t *dst);
-void subtract_average_4x8_sse2(const int16_t *src, int16_t *dst);
-void subtract_average_4x16_sse2(const int16_t *src, int16_t *dst);
+void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
 
 // SSE2 version is optimal for width == 8, we reuse them in AVX2
-void subtract_average_8x4_sse2(const int16_t *src, int16_t *dst);
-void subtract_average_8x8_sse2(const int16_t *src, int16_t *dst);
-void subtract_average_8x16_sse2(const int16_t *src, int16_t *dst);
-void subtract_average_8x32_sse2(const int16_t *src, int16_t *dst);
+void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
 
 void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                            int dst_stride, int alpha_q3);
diff --git a/av1/common/x86/cfl_sse2.c b/av1/common/x86/cfl_sse2.c
index 18a6c07..4783fe0 100644
--- a/av1/common/x86/cfl_sse2.c
+++ b/av1/common/x86/cfl_sse2.c
@@ -19,7 +19,7 @@
   return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
 }
 
-static INLINE void subtract_average_sse2(const int16_t *src_ptr,
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
                                          int16_t *dst_ptr, int width,
                                          int height, int round_offset,
                                          int num_pel_log2) {
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index bbff892..bbf0072 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -39,7 +39,7 @@
  */
 static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
                                                       int input_stride,
-                                                      int16_t *pred_buf_q3,
+                                                      uint16_t *pred_buf_q3,
                                                       int width, int height) {
   const __m128i twos = _mm_set1_epi8(2);
   __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
@@ -94,7 +94,7 @@
  */
 static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
                                                       int input_stride,
-                                                      int16_t *pred_buf_q3,
+                                                      uint16_t *pred_buf_q3,
                                                       int width, int height) {
   const __m128i fours = _mm_set1_epi8(4);
   __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
@@ -134,7 +134,7 @@
  */
 static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
                                                       int input_stride,
-                                                      int16_t *pred_buf_q3,
+                                                      uint16_t *pred_buf_q3,
                                                       int width, int height) {
   const __m128i zeros = _mm_setzero_si128();
   const int luma_stride = input_stride;
@@ -180,9 +180,9 @@
  */
 static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                       int input_stride,
-                                                      int16_t *pred_buf_q3,
+                                                      uint16_t *pred_buf_q3,
                                                       int width, int height) {
-  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
   const int luma_stride = input_stride << 1;
   do {
     if (width == 4) {
@@ -235,7 +235,7 @@
  */
 static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
                                                       int input_stride,
-                                                      int16_t *pred_buf_q3,
+                                                      uint16_t *pred_buf_q3,
                                                       int width, int height) {
   __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
   const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
@@ -268,9 +268,9 @@
 
 static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
                                                       int input_stride,
-                                                      int16_t *pred_buf_q3,
+                                                      uint16_t *pred_buf_q3,
                                                       int width, int height) {
-  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
   do {
     if (width == 4) {
       const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 27b4e86..e4d438d 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -178,8 +178,8 @@
 TEST_P(CFLSubAvgTest, SubAvgTest) {
   for (int it = 0; it < NUM_ITERATIONS; it++) {
     randData(&ACMRandom::Rand15Signed);
-    sub_avg(data, data);
-    sub_avg_ref(data_ref, data_ref);
+    sub_avg((uint16_t *)data, data);
+    sub_avg_ref((uint16_t *)data_ref, data_ref);
     assert_eq<int16_t>(data, data_ref, width, height);
   }
 }
@@ -190,13 +190,13 @@
   randData(&ACMRandom::Rand15Signed);
   aom_usec_timer_start(&ref_timer);
   for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
-    sub_avg_ref(data_ref, data_ref);
+    sub_avg_ref((uint16_t *)data_ref, data_ref);
   }
   aom_usec_timer_mark(&ref_timer);
   int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
   aom_usec_timer_start(&timer);
   for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
-    sub_avg(data, data);
+    sub_avg((uint16_t *)data, data);
   }
   aom_usec_timer_mark(&timer);
   int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
@@ -225,21 +225,21 @@
 
   void subsampleTest(T fun, T fun_ref, int sub_width, int sub_height,
                      I (ACMRandom::*random)()) {
-    int16_t sub_luma_pels[CFL_BUF_SQUARE];
-    int16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
 
     for (int it = 0; it < NUM_ITERATIONS; it++) {
       CFLTestWithData<I>::randData(random);
       fun(this->data, CFL_BUF_LINE, sub_luma_pels);
       fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref);
-      assert_eq<int16_t>(sub_luma_pels, sub_luma_pels_ref, sub_width,
-                         sub_height);
+      assert_eq<uint16_t>(sub_luma_pels, sub_luma_pels_ref, sub_width,
+                          sub_height);
     }
   }
 
   void subsampleSpeedTest(T fun, T fun_ref, I (ACMRandom::*random)()) {
-    int16_t sub_luma_pels[CFL_BUF_SQUARE];
-    int16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
     aom_usec_timer ref_timer;
     aom_usec_timer timer;