Clean up and speed up CLPF clipping

* Move the clipping tests from inside the inner loops to outside them
* Pass the clipped block size to clpf_block() as sizex and sizey
  rather than always passing bs for both (see the sketch below)
* Make the tests for falling back to C more accurate
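
For reference, the copy-back restructuring in clpf.c boils down to the
following stand-alone sketch (copy_block and its parameter names are
illustrative, not the identifiers in the patch):

    #include <stdint.h>
    #include <string.h>

    #define AOMMIN(a, b) ((a) < (b) ? (a) : (b))

    static void copy_block(uint8_t *dst, int dstride, const uint8_t *src,
                           int bs, int xpos, int ypos, int width, int height) {
      /* Clip once per block instead of testing inside the row loop. */
      const int sizex = AOMMIN(width - xpos, bs);  /* clipped width */
      const int sizey = AOMMIN(height - ypos, bs); /* clipped height */
      int c;
      if (sizex == 8) /* full 8-wide rows: one 64-bit store per row */
        for (c = 0; c < sizey; c++)
          *(uint64_t *)(dst + c * dstride) =
              *(const uint64_t *)(src + c * bs);
      else if (sizex == 4) /* full 4-wide rows: one 32-bit store per row */
        for (c = 0; c < sizey; c++)
          *(uint32_t *)(dst + c * dstride) =
              *(const uint32_t *)(src + c * bs);
      else /* partial rows at the right frame edge */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, src + c * bs, sizex);
    }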

Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 9eef2b5..1cf5272 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -153,8 +153,11 @@
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+            int sizex, sizey;
             xpos = xoff + n * bs;
             ypos = yoff + m * bs;
+            sizex = AOMMIN(width - xpos, bs);
+            sizey = AOMMIN(height - ypos, bs);
             if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                      (xpos << subx) / MI_SIZE]
                      ->mbmi.skip) {  // Not skip block
@@ -164,30 +167,49 @@
 #if CONFIG_AOM_HIGHBITDEPTH
                 if (cm->use_highbitdepth) {
                   uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
-                  for (c = 0; c < bs; c++) {
-                    *(uint64_t *)(d + c * sstride) =
-                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
-                    if (bs == 8)
+                  if (sizex == 8) {
+                    for (c = 0; c < sizey; c++) {
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
                       *(uint64_t *)(d + c * sstride + 4) =
                           *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+                    }
+                  } else if (sizex == 4) {
+                    for (c = 0; c < sizey; c++)
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                  } else {
+                    for (c = 0; c < sizey; c++)
+                      memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+                             sizex * 2);
                   }
                 } else {
-                  for (c = 0; c < bs; c++)
-                    if (bs == 8)
+                  if (sizex == 8)
+                    for (c = 0; c < sizey; c++)
                       *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                           *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                    else
+                  else if (sizex == 4)
+                    for (c = 0; c < sizey; c++)
                       *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                           *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                  else
+                    for (c = 0; c < sizey; c++)
+                      memcpy(cache_dst[cache_idx] + c * sstride,
+                             cache_ptr[cache_idx] + c * bs, sizex);
                 }
 #else
-                for (c = 0; c < bs; c++)
-                  if (bs == 8)
+                if (sizex == 8)
+                  for (c = 0; c < sizey; c++)
                     *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                         *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                  else
+                else if (sizex == 4)
+                  for (c = 0; c < sizey; c++)
                     *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                         *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                else
+                  for (c = 0; c < sizey; c++)
+                    memcpy(cache_dst[cache_idx] + c * sstride,
+                           cache_ptr[cache_idx] + c * bs, sizex);
 #endif
               }
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -211,15 +233,15 @@
               if (cm->use_highbitdepth) {
                 aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
                                    CONVERT_TO_SHORTPTR(dst_buffer), sstride,
-                                   dstride, xpos, ypos, bs, bs, width, height,
-                                   strength);
+                                   dstride, xpos, ypos, sizex, sizey, width,
+                                   height, strength);
               } else {
                 aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                               ypos, bs, bs, width, height, strength);
+                               ypos, sizex, sizey, width, height, strength);
               }
 #else
               aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                             ypos, bs, bs, width, height, strength);
+                             ypos, sizex, sizey, width, height, strength);
 #endif
             }
           }
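
The clpf_simd.h changes below apply the same idea to the SIMD kernels:
the unaligned edge loads are issued only when the block is not at a
frame border, and bordering blocks instead build b/c/d/e by shuffling
the already-loaded centre vector o. Testing only x0 and right suffices
because filter blocks start at multiples of at least 4 pixels, so a
nonzero x0 leaves room for the two-pixel reads to the left. A scalar
model of what the left-edge shuffles compute (pixel_left is a
hypothetical helper, not part of the patch; border replication matches
the aom_clpf_block_c reference):

    /* Pixel `off` (1 or 2) positions to the left of x in a row: at the
     * left frame border the out-of-frame read is clamped to pixel 0,
     * which is what shuffling o through b_shuff/c_shuff implements. */
    static int pixel_left(const unsigned char *row, int x, int off) {
      return row[x - off < 0 ? 0 : x - off];
    }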
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 979856b..6fef4b7 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -76,24 +76,27 @@
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -134,31 +137,34 @@
     const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
     v128 o = v128_from_32(l1, l2, l3, l4);
     const v128 a = v128_from_32(l0, l1, l2, l3);
-    v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
-                          u32_load_unaligned(src + sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
-    v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
-                          u32_load_unaligned(src + sstride - !!x0),
-                          u32_load_unaligned(src + 2 * sstride - !!x0),
-                          u32_load_unaligned(src + 3 * sstride - !!x0));
-    v128 d = v128_from_32(u32_load_unaligned(src + !!right),
-                          u32_load_unaligned(src + sstride + !!right),
-                          u32_load_unaligned(src + 2 * sstride + !!right),
-                          u32_load_unaligned(src + 3 * sstride + !!right));
-    v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
-                          u32_load_unaligned(src + sstride + 2 * !!right),
-                          u32_load_unaligned(src + 2 * sstride + 2 * !!right),
-                          u32_load_unaligned(src + 3 * sstride + 2 * !!right));
     const v128 f = v128_from_32(l2, l3, l4, l5);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_32(u32_load_unaligned(src - 2),
+                       u32_load_unaligned(src + sstride - 2),
+                       u32_load_unaligned(src + 2 * sstride - 2),
+                       u32_load_unaligned(src + 3 * sstride - 2));
+      c = v128_from_32(u32_load_unaligned(src - 1),
+                       u32_load_unaligned(src + sstride - 1),
+                       u32_load_unaligned(src + 2 * sstride - 1),
+                       u32_load_unaligned(src + 3 * sstride - 1));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_32(u32_load_unaligned(src + 1),
+                       u32_load_unaligned(src + sstride + 1),
+                       u32_load_unaligned(src + 2 * sstride + 1),
+                       u32_load_unaligned(src + 3 * sstride + 1));
+      e = v128_from_32(u32_load_unaligned(src + 2),
+                       u32_load_unaligned(src + sstride + 2),
+                       u32_load_unaligned(src + 2 * sstride + 2),
+                       u32_load_unaligned(src + 3 * sstride + 2));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -176,9 +182,10 @@
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
-      (sizey & 3 && sizex == 4) || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+    // Fall back to C for odd sizes:
+    // * block width not 4 or 8
+    // * block height not a multiple of 4 when the block width is 4
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
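
The new 8-bit fallback test, restated as a predicate for clarity
(clpf_needs_c_fallback is an illustrative helper, not code in the
patch): the SIMD kernels handle only block widths of 4 and 8, and the
4-wide kernel consumes four rows per iteration, so 4-wide blocks also
need a height that is a multiple of 4. The old y0 + 4 > height and
x0 + 4 > width tests are no longer needed because the caller now
passes the clipped sizex/sizey instead of bs.

    static int clpf_needs_c_fallback(int sizex, int sizey) {
      return (sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4);
    }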
@@ -255,24 +262,27 @@
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
     src += sstride * 2;
@@ -309,18 +319,21 @@
     const v128 o = v128_load_aligned(src);
     const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
     const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
-    v128 b = v128_load_unaligned(src - 2 * !!x0);
-    v128 c = v128_load_unaligned(src - !!x0);
-    v128 d = v128_load_unaligned(src + !!right);
-    v128 e = v128_load_unaligned(src + 2 * !!right);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_load_unaligned(src - 2);
+      c = v128_load_unaligned(src - 1);
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_load_unaligned(src + 1);
+      e = v128_load_unaligned(src + 2);
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
     src += sstride;
@@ -332,8 +345,10 @@
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, int width, int height,
                                    unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fall back to C for odd sizes:
+    // * block width not 4 or 8
+    // * block height not a multiple of 2 when the block width is 4
     aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                          width, height, strength);
   } else {