Reduce memory footprint for CLPF decoding.

Instead of having CLPF write to an entirely new frame and
then copy the result back into the original frame, make the
filter able to work in place by keeping a buffer of size
frame_width*filter_block_size and delaying the write-back
by one row of filter blocks (filter_block_size pixels high).

This reduces the cycles spent in the filter to ~75% of the
previous count.

Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1
diff --git a/aom_dsp/ b/aom_dsp/
index a2b9a75..5f7384b 100644
--- a/aom_dsp/
+++ b/aom_dsp/
@@ -587,7 +587,7 @@
 specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
 if (aom_config("CONFIG_CLPF") eq "yes") {
-  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
   specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
   add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
   specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 799af01..1ca60e0 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -27,30 +27,30 @@
   return (8 + delta - (delta < 0)) >> 4;
-void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                      int y0, int sizex, int sizey, int width, int height,
-                      unsigned int strength) {
+void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
+                      int dstride, int x0, int y0, int sizex, int sizey,
+                      int width, int height, unsigned int strength) {
   int x, y;
   for (y = y0; y < y0 + sizey; y++) {
     for (x = x0; x < x0 + sizex; x++) {
-      int X = src[y * stride + x];
-      int A = src[AOMMAX(0, y - 1) * stride + x];
-      int B = src[y * stride + AOMMAX(0, x - 2)];
-      int C = src[y * stride + AOMMAX(0, x - 1)];
-      int D = src[y * stride + AOMMIN(width - 1, x + 1)];
-      int E = src[y * stride + AOMMIN(width - 1, x + 2)];
-      int F = src[AOMMIN(height - 1, y + 1) * stride + x];
+      int X = src[y * sstride + x];
+      int A = src[AOMMAX(0, y - 1) * sstride + x];
+      int B = src[y * sstride + AOMMAX(0, x - 2)];
+      int C = src[y * sstride + AOMMAX(0, x - 1)];
+      int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+      int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+      int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
       int delta;
       delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
-      dst[y * stride + x] = X + delta;
+      dst[y * dstride + x] = X + delta;
 // Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
+                   const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org,
+                   AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
                    unsigned int fb_size_log2, uint8_t *blocks,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
                                    const YV12_BUFFER_CONFIG *,
@@ -59,23 +59,45 @@
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int bs = MI_SIZE;
-  int width = rec->y_crop_width;
-  int height = rec->y_crop_height;
+  const int width = rec->y_crop_width;
+  const int height = rec->y_crop_height;
   int xpos, ypos;
-  int stride_y = rec->y_stride;
-  int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
-  int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int sstride = rec->y_stride;
+  int dstride = orig_dst->y_stride;
+  const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
   int block_index = 0;
+  uint8_t *cache = NULL;
+  uint8_t **cache_ptr = NULL;
+  uint8_t **cache_dst = NULL;
+  int cache_idx = 0;
+  const int cache_size = num_fb_hor << (2 * fb_size_log2);
+  const int cache_blocks = cache_size / (bs * bs);
+  YV12_BUFFER_CONFIG dst = *orig_dst;
+  // Make buffer space for in-place filtering
+  if (rec->y_buffer == dst.y_buffer) {
+    CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
+    CHECK_MEM_ERROR(cm, cache_ptr,
+                    aom_malloc(cache_blocks * sizeof(*cache_ptr)));
+    CHECK_MEM_ERROR(cm, cache_dst,
+                    aom_malloc(cache_blocks * sizeof(*cache_dst)));
+    memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
+    dst.y_buffer = cache;
+    dstride = bs;
+  }
   // Iterate over all filter blocks
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
       int allskip = 1;
+      const int xoff = l << fb_size_log2;
+      const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
         for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
-          xpos = (l << fb_size_log2) + n * bs;
-          ypos = (k << fb_size_log2) + m * bs;
+          xpos = xoff + n * bs;
+          ypos = yoff + m * bs;
           if (xpos < width && ypos < height) {
             allskip &=
                 cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
@@ -96,31 +118,57 @@
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < (h + bs - 1) / bs; m++) {
           for (n = 0; n < (w + bs - 1) / bs; n++) {
-            xpos = (l << fb_size_log2) + n * bs;
-            ypos = (k << fb_size_log2) + m * bs;
+            xpos = xoff + n * bs;
+            ypos = yoff + m * bs;
             if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
-                     ->mbmi.skip) {
-              // Not skip block, apply the filter
-              aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos,
-                             bs, bs, width, height, strength);
+                     ->mbmi.skip) {  // Not skip block
+              // Temporary buffering needed if filtering in-place
+              if (cache) {
+                if (cache_ptr[cache_idx]) {
+                  // Copy filtered block back into the frame
+                  for (c = 0; c < bs; c++)
+                    *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+                }
+                cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+                dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+                cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos;
+                if (++cache_idx >= cache_blocks) cache_idx = 0;
+              }
+              // Apply the filter
+              aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
+                             xpos, ypos, bs, bs, width, height, strength);
             } else {  // Skip block, copy instead
-              for (c = 0; c < bs; c++)
-                *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) =
-                    *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos);
+              if (!cache)
+                for (c = 0; c < bs; c++)
+                  *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *(
+                      uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos);
       } else {  // Entire filter block is skip, copy
-        for (m = 0; m < h; m++)
-          memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 w);
+        if (!cache)
+          for (m = 0; m < h; m++)
+            memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
+                   rec->y_buffer + (yoff + m) * sstride + xoff, w);
       block_index += !allskip;  // Count number of blocks filtered
+  if (cache) {
+    // Copy remaining blocks into the frame
+    for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
+         cache_idx++)
+      for (c = 0; c < bs; c++)
+        *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+            *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+    aom_free(cache);
+    aom_free(cache_ptr);
+  }
   return block_index;
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 21671a1..2fb12d6 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -18,7 +18,7 @@
 int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
 int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
                    int enable_fb_flag, unsigned int strength,
                    unsigned int fb_size_log2, uint8_t *blocks,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 0df6cd7..544aa36 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -11,11 +11,11 @@
 #include "./aom_dsp_rtcd.h"
-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                       int y0, int sizey, int width, int height,
-                       unsigned int strength) {
-  dst += x0 + y0 * stride;
-  src += x0 + y0 * stride;
+static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
+                       int dstride, int x0, int y0, int sizey, int width,
+                       int height, unsigned int strength) {
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
     int bottom = height - 2 - y0;
     const v128 sp = v128_dup_8(strength);
@@ -32,23 +32,23 @@
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_shuffle_8(x, b_shuff);
         const v128 c = v128_shuffle_8(x, c_shuff);
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -70,9 +70,9 @@
                                                         delta, v128_zero()))),
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
     } else if (!(width - x0 - 8)) {  // Clip right
       const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
@@ -83,23 +83,23 @@
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_shuffle_8(x, d_shuff);
         const v128 e = v128_shuffle_8(x, e_shuff);
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -121,35 +121,35 @@
                                                         delta, v128_zero()))),
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
     } else {  // No left/right clipping
       int y;
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -171,17 +171,18 @@
                                                         delta, v128_zero()))),
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
-                               int x0, int y0, int sizex, int sizey, int width,
-                               int height, unsigned int strength) {
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+                               int dstride, int x0, int y0, int sizex,
+                               int sizey, int width, int height,
+                               unsigned int strength) {
   // TODO(stemidts):
   // A sizex different from 8 will only be needed if CLPF is extended to chroma.
   // This will only be used if 4:2:0 and width not a multiple of 16 and along
@@ -189,9 +190,10 @@
   // this case.  If not extended to chroma, this test will be redundant.
   if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
     // Fallback to C for odd sizes
-    aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
-                     strength);
+    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+                     height, strength);
   } else {
-    clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+    clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
+               strength);
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index dc18944..6b2de8c 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3929,19 +3929,10 @@
   if (cm->clpf_strength && !cm->skip_loop_filter) {
-    YV12_BUFFER_CONFIG dst;  // Buffer for the result
-    dst = pbi->cur_buf->buf;
-    CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
-    av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
+    const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
+    av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
                    cm->clpf_strength + (cm->clpf_strength == 3),
                    4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
-    // Copy result
-    memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
-           dst.y_height * dst.y_stride);
-    aom_free(dst.y_buffer);
   if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
diff --git a/test/ b/test/
index 786180b..755d1f1 100644
--- a/test/
+++ b/test/
@@ -26,9 +26,9 @@
 namespace {
-typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
-                             int x0, int y0, int sizex, int sizey, int width,
-                             int height, unsigned int strength);
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
+                             int dstride, int x0, int y0, int sizex, int sizey,
+                             int width, int height, unsigned int strength);
 typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
@@ -85,10 +85,10 @@
       for (ypos = 0; ypos < size && !error; ypos += h * !error) {
         for (xpos = 0; xpos < size && !error; xpos += w * !error) {
           for (strength = 0; strength < 3 && !error; strength += !error) {
-            ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
+            ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
                      1 << strength);
-                clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
+            ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
+                                          size, size, 1 << strength));
             for (pos = 0; pos < size * size && !error; pos++) {
               error = ref_d[pos] != d[pos];
@@ -137,7 +137,8 @@
     for (ypos = 0; ypos < size; ypos += h) {
       for (xpos = 0; xpos < size; xpos += w) {
         for (strength = 0; strength < 3; strength++) {
-          ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+          ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
+                   1 << strength);
@@ -150,7 +151,7 @@
     for (ypos = 0; ypos < size; ypos += h) {
       for (xpos = 0; xpos < size; xpos += w) {
         for (strength = 0; strength < 3; strength++) {
-          clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+          clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);