[CFL] Use separate buffers for reconstructed pixels and AC

This avoids the rare situation where the average is subtracted twice.
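
For context, a minimal sketch of why the in-place form is fragile: when the
reconstructed pixels and the AC values share one buffer, a second pass over
that buffer removes the average again. The two helpers below are hypothetical
and only illustrate the hazard and the fix; they are not the library code.

  /* Hypothetical illustration: with a single shared buffer, running the
   * subtraction twice yields buf[i] - 2 * avg instead of buf[i] - avg.
   * Writing into a distinct AC buffer leaves the source pixels intact,
   * so the operation is safe to repeat. */
  static void subtract_avg_in_place(int16_t *buf, int n, int16_t avg) {
    for (int i = 0; i < n; i++) buf[i] -= avg;  // not idempotent
  }

  static void subtract_avg_two_buf(const int16_t *recon, int16_t *ac, int n,
                                   int16_t avg) {
    for (int i = 0; i < n; i++) ac[i] = recon[i] - avg;  // safe to repeat
  }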

Results on Subset1:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0247 | -0.0212 | -0.0183 |   0.0077 | -0.0491 | -0.0565 |     0.0109

https://arewecompressedyet.com/?job=master%402018-05-10&job=2buf_cfl%402018-05-10T14%3A02%3A12.666Z

BUG=aomedia:1868

Change-Id: I592fbfb42490cc2ee4046a3dbe853f8eca7b91af
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index d550c68..2b407fc 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -14,8 +14,9 @@
 
 #include "av1/common/cfl.h"
 
-static INLINE void vldsubstq_s16(int16_t *buf, int16x8_t sub) {
-  vst1q_s16(buf, vsubq_s16(vld1q_s16(buf), sub));
+static INLINE void vldsubstq_s16(int16_t *dst, const int16_t *src, int offset,
+                                 int16x8_t sub) {
+  vst1q_s16(dst + offset, vsubq_s16(vld1q_s16(src + offset), sub));
 }
 
 static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
@@ -263,10 +264,11 @@
 
 CFL_GET_SUBSAMPLE_FUNCTION(neon)
 
-static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
-                                         int height, int round_offset,
+static INLINE void subtract_average_neon(const int16_t *src, int16_t *dst,
+                                         int width, int height,
+                                         int round_offset,
                                          const int num_pel_log2) {
-  const int16_t *const end = pred_buf + height * CFL_BUF_LINE;
+  const int16_t *const end = src + height * CFL_BUF_LINE;
   const uint16_t *const sum_end = (uint16_t *)end;
 
   // Round offset is not needed, because NEON will handle the rounding.
@@ -279,7 +281,7 @@
   // pixels, which are positive integer and only require 15 bits. By using
   // unsigned integer for the sum, we can do one addition operation inside 16
   // bits (8 lanes) before having to convert to 32 bits (4 lanes).
-  const uint16_t *sum_buf = (uint16_t *)pred_buf;
+  const uint16_t *sum_buf = (uint16_t *)src;
   uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
   do {
     // For all widths, we load, add and combine the data so it fits in 4 lanes.
@@ -319,7 +321,8 @@
         sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
       }
     }
-  } while ((sum_buf += step) < sum_end);
+    sum_buf += step;
+  } while (sum_buf < sum_end);
 
   // Permute and add in such a way that each lane contains the block sum.
   // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
@@ -352,33 +355,37 @@
 
   if (width == 4) {
     do {
-      vst1_s16(pred_buf, vsub_s16(vld1_s16(pred_buf), avg_16x4));
-    } while ((pred_buf += CFL_BUF_LINE) < end);
+      vst1_s16(dst, vsub_s16(vld1_s16(src), avg_16x4));
+      src += CFL_BUF_LINE;
+      dst += CFL_BUF_LINE;
+    } while (src < end);
   } else {
     const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
     do {
-      vldsubstq_s16(pred_buf, avg_16x8);
-      vldsubstq_s16(pred_buf + CFL_BUF_LINE, avg_16x8);
-      vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE, avg_16x8);
-      vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 0, avg_16x8);
+      vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8);
 
       if (width > 8) {
-        vldsubstq_s16(pred_buf + 8, avg_16x8);
-        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 8, avg_16x8);
-        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 8, avg_16x8);
-        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 8, avg_16x8);
+        vldsubstq_s16(dst, src, 8, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8);
       }
       if (width == 32) {
-        vldsubstq_s16(pred_buf + 16, avg_16x8);
-        vldsubstq_s16(pred_buf + 24, avg_16x8);
-        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 16, avg_16x8);
-        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 24, avg_16x8);
-        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 16, avg_16x8);
-        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 24, avg_16x8);
-        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 16, avg_16x8);
-        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 24, avg_16x8);
+        vldsubstq_s16(dst, src, 16, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8);
       }
-    } while ((pred_buf += step) < end);
+      src += step;
+      dst += step;
+    } while (src < end);
   }
 }
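
For reference, a scalar sketch of what the updated subtract_average_neon
computes with the separate buffers: read reconstructed pixels from src, write
pixel-minus-average AC values to dst, one row per CFL_BUF_LINE stride. The
rounding expression mirrors the generic C path as I understand it (the NEON
kernel folds the rounding into vrshrn instead); treat this as illustrative,
not as the library implementation.

  static void subtract_average_ref(const int16_t *src, int16_t *dst, int width,
                                   int height, int round_offset,
                                   int num_pel_log2) {
    int sum = 0;
    // Sum all reconstructed pixels in the width x height block.
    for (int j = 0; j < height; j++)
      for (int i = 0; i < width; i++) sum += src[j * CFL_BUF_LINE + i];
    // Rounded average over the block (num_pel_log2 = log2(width * height)).
    const int avg = (sum + round_offset) >> num_pel_log2;
    // Write the AC contribution to the destination buffer, leaving src intact.
    for (int j = 0; j < height; j++)
      for (int i = 0; i < width; i++)
        dst[j * CFL_BUF_LINE + i] = (int16_t)(src[j * CFL_BUF_LINE + i] - avg);
  }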