Reduce memory footprint for CLPF decoding.

Instead of having CLPF write to an entire new frame and
copy the result back into the original frame, make the
filter able to work in place by keeping a buffer of size
frame_width * filter_block_size and delaying the write-back
by one filter_block_size row.

This reduces the cycles spent in the filter to ~75% of the previous count.

Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1
diff --git a/test/clpf_test.cc b/test/clpf_test.cc
index 786180b..755d1f1 100644
--- a/test/clpf_test.cc
+++ b/test/clpf_test.cc
@@ -26,9 +26,9 @@
 
 namespace {
 
-typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
-                             int x0, int y0, int sizex, int sizey, int width,
-                             int height, unsigned int strength);
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
+                             int dstride, int x0, int y0, int sizex, int sizey,
+                             int width, int height, unsigned int strength);
 
 typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
     clpf_block_param_t;
@@ -85,10 +85,10 @@
       for (ypos = 0; ypos < size && !error; ypos += h * !error) {
         for (xpos = 0; xpos < size && !error; xpos += w * !error) {
           for (strength = 0; strength < 3 && !error; strength += !error) {
-            ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
+            ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
                      1 << strength);
-            ASM_REGISTER_STATE_CHECK(
-                clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
+            ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
+                                          size, size, 1 << strength));
 
             for (pos = 0; pos < size * size && !error; pos++) {
               error = ref_d[pos] != d[pos];
@@ -137,7 +137,8 @@
     for (ypos = 0; ypos < size; ypos += h) {
       for (xpos = 0; xpos < size; xpos += w) {
         for (strength = 0; strength < 3; strength++) {
-          ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+          ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
+                   1 << strength);
         }
       }
     }
@@ -150,7 +151,7 @@
     for (ypos = 0; ypos < size; ypos += h) {
       for (xpos = 0; xpos < size; xpos += w) {
         for (strength = 0; strength < 3; strength++) {
-          clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+          clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);
         }
       }
     }