cfl: add missing prototypes

fixes -Wmissing-prototypes warning

Bug: aomedia:3416
Change-Id: I7661fce3f4c2ccaaef59f144030079dd5c126a13
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 0e37d45..bd11c4a 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -159,8 +159,9 @@
 CFL_PREDICT_FN(c, lbd)
 
 #if CONFIG_AV1_HIGHBITDEPTH
-void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
-                       int alpha_q3, int bit_depth, int width, int height) {
+static INLINE void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst,
+                                     int dst_stride, int alpha_q3,
+                                     int bit_depth, int width, int height) {
   for (int j = 0; j < height; j++) {
     for (int i = 0; i < width; i++) {
       dst[i] = clip_pixel_highbd(
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index dcaa87b..dbb94d6 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -96,6 +96,8 @@
 // goodness.
 #define CFL_SUBSAMPLE(arch, sub, bd, width, height)                       \
   void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch(          \
+      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3);      \
+  void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch(          \
       const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
     cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
                                                output_q3, width, height); \
@@ -171,6 +173,8 @@
 // goodness.
 #define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)       \
   void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+                                                        int16_t *dst);       \
+  void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
                                                         int16_t *dst) {      \
     subtract_average_##arch(src, dst, width, height, round_offset,           \
                             num_pel_log2);                                   \
@@ -220,24 +224,23 @@
     return sub_avg[tx_size % TX_SIZES_ALL];                               \
   }
 
-// For VSX SIMD optimization, the C versions of width == 4 subtract are
-// faster than the VSX. As such, the VSX code calls the C versions.
-void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
-void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
-void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
-
-#define CFL_PREDICT_lbd(arch, width, height)                              \
-  void cfl_predict_lbd_##width##x##height##_##arch(                       \
-      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,           \
-      int alpha_q3) {                                                     \
-    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
-                           height);                                       \
+#define CFL_PREDICT_lbd(arch, width, height)                                   \
+  void cfl_predict_lbd_##width##x##height##_##arch(                            \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \
+  void cfl_predict_lbd_##width##x##height##_##arch(                            \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,                \
+      int alpha_q3) {                                                          \
+    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
+                           height);                                            \
   }
 
 #if CONFIG_AV1_HIGHBITDEPTH
 #define CFL_PREDICT_hbd(arch, width, height)                                   \
   void cfl_predict_hbd_##width##x##height##_##arch(                            \
       const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
+      int bd);                                                                 \
+  void cfl_predict_hbd_##width##x##height##_##arch(                            \
+      const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
       int bd) {                                                                \
     cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
                            height);                                            \
diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c
index 6f88768..27a7f07 100644
--- a/av1/common/ppc/cfl_ppc.c
+++ b/av1/common/ppc/cfl_ppc.c
@@ -124,6 +124,10 @@
 
 // Based on observation, for small blocks VSX does not outperform C (no 64bit
 // load and store intrinsics). So we call the C code for block widths 4.
+extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
 cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
   static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
     cfl_subtract_average_4x4_c,     /* 4x4 */