Reduce memory footprint for CLPF decoding. Instead of having CLPF write to an entire new frame and copy the result back into the original frame, make the filter able to work in-place by keeping a buffer of size frame_width*filter_block_size and delaying the write-back by one filter_block_size row. This reduces the cycles spent in the filter to ~75%. Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1

commit: e8224c7ad5747d1888ddb181839f205a1752afe0 [log] [tgz]
author: Steinar Midtskogen <stemidts@cisco.com> Wed Aug 24 13:00:04 2016 +0200
committer: Yaowu Xu <yaowu@google.com> Mon Oct 10 11:26:33 2016 -0700
tree: 3587f06b60afb726591ff6fe2fa1b100819c5dbe
parent: 34dac00adc8221a8cc0a7f1c7d8257c5dbc5794f [diff] [blame]
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 0df6cd7..544aa36 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h

@@ -11,11 +11,11 @@
 
 #include "./aom_dsp_rtcd.h"
 
-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                       int y0, int sizey, int width, int height,
-                       unsigned int strength) {
-  dst += x0 + y0 * stride;
-  src += x0 + y0 * stride;
+static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
+                       int dstride, int x0, int y0, int sizey, int width,
+                       int height, unsigned int strength) {
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
   {
     int bottom = height - 2 - y0;
     const v128 sp = v128_dup_8(strength);
@@ -32,23 +32,23 @@
 
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_shuffle_8(x, b_shuff);
         const v128 c = v128_shuffle_8(x, c_shuff);
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -70,9 +70,9 @@
                                                         delta, v128_zero()))),
                    4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     } else if (!(width - x0 - 8)) {  // Clip right
       const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
@@ -83,23 +83,23 @@
 
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_shuffle_8(x, d_shuff);
         const v128 e = v128_shuffle_8(x, e_shuff);
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -121,35 +121,35 @@
                                                         delta, v128_zero()))),
                    4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     } else {  // No left/right clipping
       int y;
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -171,17 +171,18 @@
                                                         delta, v128_zero()))),
                    4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     }
   }
 }
 
-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
-                               int x0, int y0, int sizex, int sizey, int width,
-                               int height, unsigned int strength) {
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+                               int dstride, int x0, int y0, int sizex,
+                               int sizey, int width, int height,
+                               unsigned int strength) {
   // TODO(stemidts):
   // A sizex different from 8 will only be needed if CLPF is extended to chroma.
   // This will only be used if 4:2:0 and width not a multiple of 16 and along
@@ -189,9 +190,10 @@
   // this case.  If not extended to chroma, this test will be redundant.
   if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
     // Fallback to C for odd sizes
-    aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
-                     strength);
+    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+                     height, strength);
   } else {
-    clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+    clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
+               strength);
   }
 }
commit	e8224c7ad5747d1888ddb181839f205a1752afe0	[log] [tgz]
author	Steinar Midtskogen <stemidts@cisco.com>	Wed Aug 24 13:00:04 2016 +0200
committer	Yaowu Xu <yaowu@google.com>	Mon Oct 10 11:26:33 2016 -0700
tree	3587f06b60afb726591ff6fe2fa1b100819c5dbe
parent	34dac00adc8221a8cc0a7f1c7d8257c5dbc5794f [diff] [blame]