Move CLPF block signals from frame to SB level. These signals were in the uncompressed frame header (as a temporary hack), which caused two problems: * We don't want that header to be duplicated in the slice header * It was necessary to signal the number of bits to transmit up front However, the filter size can be 128x128 which is greater than the SB size, and a decoder wouldn't be able to know whether to read a bit or not until the final SB of that 128x128 block has been decoded (depending on whether the 128x128 is all skip or not). Therefore the signalling was changed for 128x128 blocks so that every top left SB of a 128x128 filter block contains a signal regardless of whether the block is all skip or not. Also, all the MB's of 128x128 block are filtered even if they are skip MB's. This gives the signal a purpose even when the 128x128 block is all skip, and it also gives a slight coding gain as it leaves a way to filter skip blocks, which was previously forbidden. Low latency: PSNR YCbCr: -0.19% -0.14% -0.06% PSNRHVS: -0.15% SSIM: -0.13% MSSSIM: -0.15% CIEDE2000: -0.19% High latency: PSNR YCbCr: -0.03% -0.01% -0.09% PSNRHVS: 0.04% SSIM: 0.00% MSSSIM: 0.02% CIEDE2000: -0.02% Change-Id: I69ba7144d07d388b4f0968f6a53558f480979171

commit: 85437b211a79804eb668d97aa54375c9ed64f209 [log] [tgz]
author: Steinar Midtskogen <stemidts@cisco.com> Wed Sep 21 13:38:16 2016 +0200
committer: Steinar Midtskogen <stemidts@cisco.com> Tue Oct 04 12:55:15 2016 +0000
tree: 4acf5b309e8086c855c48d12392e869da05e0412
parent: f041c0ff9a2478acd66056e2f1140fa3bb2f9e3f [diff]
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 1cf5272..a01e6b4 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c

@@ -14,14 +14,6 @@
 #include "aom/aom_image.h"
 #include "aom_dsp/aom_dsp_common.h"
 
-int av1_clpf_maxbits(const AV1_COMMON *cm) {
-  return get_msb(
-             ALIGN_POWER_OF_TWO(cm->mi_cols * MI_SIZE, cm->clpf_size + 4) *
-                 ALIGN_POWER_OF_TWO(cm->mi_rows * MI_SIZE, cm->clpf_size + 4) >>
-             (cm->clpf_size * 2 + 8)) +
-         1;
-}
-
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
   int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) +
               3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) +
@@ -73,14 +65,14 @@
 #endif
 
 // Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
-                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
-                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                   const YV12_BUFFER_CONFIG *,
-                                   const AV1_COMMON *cm, int, int, int,
-                                   unsigned int, unsigned int, uint8_t *)) {
+void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+                    int enable_fb_flag, unsigned int strength,
+                    unsigned int fb_size_log2, int plane,
+                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                    const YV12_BUFFER_CONFIG *,
+                                    const AV1_COMMON *cm, int, int, int,
+                                    unsigned int, unsigned int, int8_t *)) {
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
@@ -95,7 +87,6 @@
   int dstride = bs;
   const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
   const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
-  int block_index = 0;
   uint8_t *cache = NULL;
   uint8_t **cache_ptr = NULL;
   uint8_t **cache_dst = NULL;
@@ -125,7 +116,7 @@
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
-      int allskip = 1;
+      int allskip = !(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2);
       const int xoff = l << fb_size_log2;
       const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
@@ -148,8 +139,11 @@
       w += !w << fb_size_log2;
       if (!allskip &&  // Do not filter the block if all is skip encoded
           (!enable_fb_flag ||
+           // Only called if fb_flag enabled (luma only)
            decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
-                    fb_size_log2, blocks + block_index))) {
+                    fb_size_log2,
+                    cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
+                        xoff / MIN_FB_SIZE))) {
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
@@ -160,8 +154,9 @@
             sizey = AOMMIN(height - ypos, bs);
             if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                      (xpos << subx) / MI_SIZE]
-                     ->mbmi.skip) {  // Not skip block
-              // Temporary buffering needed if filtering in-place
+                     ->mbmi.skip ||
+                (enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2)) {
+              // Temporary buffering needed for in-place filtering
               if (cache_ptr[cache_idx]) {
 // Copy filtered block back into the frame
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -247,7 +242,6 @@
           }
         }
       }
-      block_index += !allskip;  // Count number of blocks filtered
     }
   }
 
@@ -287,6 +281,4 @@
   aom_free(cache);
   aom_free(cache_ptr);
   aom_free(cache_dst);
-
-  return block_index;
 }

diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 8e4213b..fc74f2c 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h

@@ -13,17 +13,19 @@
 
 #include "av1/common/reconinter.h"
 
-#define MAX_FB_SIZE 128
+#define MAX_FB_SIZE_LOG2 7
+#define MIN_FB_SIZE_LOG2 5
+#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2)
+#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
 
-int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
-                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
-                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                   const YV12_BUFFER_CONFIG *,
-                                   const AV1_COMMON *cm, int, int, int,
-                                   unsigned int, unsigned int, uint8_t *));
+void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+                    int enable_fb_flag, unsigned int strength,
+                    unsigned int fb_size_log2, int plane,
+                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                    const YV12_BUFFER_CONFIG *,
+                                    const AV1_COMMON *cm, int, int, int,
+                                    unsigned int, unsigned int, int8_t *));
 
 #endif

diff --git a/av1/common/enums.h b/av1/common/enums.h
index 879f1f2..5cf898c 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h

@@ -181,6 +181,16 @@
 } PALETTE_COLOR;
 #endif  // CONFIG_PALETTE
 
+#ifdef CONFIG_CLPF
+#define CLPF_NOFLAG -1
+typedef enum {
+  CLPF_NOSIZE = 0,
+  CLPF_32X32 = 1,
+  CLPF_64X64 = 2,
+  CLPF_128X128 = 3
+} CLPF_BLOCK_SIZE;
+#endif
+
 #define DC_PRED 0    // Average of above and left pixels
 #define V_PRED 1     // Vertical
 #define H_PRED 2     // Horizontal

diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 9e24196..1aebb0f 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h

@@ -152,12 +152,27 @@
 #endif
 
 #if CONFIG_CLPF
-  int clpf_numblocks;
-  int clpf_size;
+  // Two bits are used to signal the strength for all blocks and the
+  // valid values are:
+  // 0: no filtering
+  // 1: strength = 1
+  // 2: strength = 2
+  // 3: strength = 4
   int clpf_strength_y;
   int clpf_strength_u;
   int clpf_strength_v;
-  uint8_t *clpf_blocks;
+
+  // If clpf_strength_y is not 0, another two bits are used to signal
+  // the filter block size.  The valid values for clfp_size are:
+  // 0: no block signalling
+  // 1: 32x32
+  // 2: 64x64
+  // 3: 128x128
+  CLPF_BLOCK_SIZE clpf_size;
+
+  // Buffer for storing whether to filter individual blocks.
+  int8_t *clpf_blocks;
+  int clpf_stride;
 #endif
 
   YV12_BUFFER_CONFIG *frame_to_show;

diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index d9330d7..4d0e3ee 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c

@@ -581,6 +581,23 @@
   return p;
 }
 
+#if CONFIG_CLPF
+static int clpf_all_skip(const AV1_COMMON *cm, int mi_col, int mi_row,
+                         int size) {
+  int r, c;
+  int skip = 1;
+  const int maxc = AOMMIN(size, cm->mi_cols - mi_col);
+  const int maxr = AOMMIN(size, cm->mi_rows - mi_row);
+  for (r = 0; r < maxr && skip; r++) {
+    for (c = 0; c < maxc && skip; c++) {
+      skip &= !!cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]
+                    ->mbmi.skip;
+    }
+  }
+  return skip;
+}
+#endif
+
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
                              int mi_row, int mi_col, aom_reader *r,
@@ -637,6 +654,43 @@
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
 
+#if CONFIG_CLPF
+  if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
+      cm->clpf_size != CLPF_NOSIZE) {
+    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
+                   mi_col * MI_SIZE / MIN_FB_SIZE;
+
+    if (!((mi_row * MI_SIZE) & 127) && !((mi_col * MI_SIZE) & 127) &&
+        cm->clpf_size == CLPF_128X128) {
+      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+    } else if (cm->clpf_size == CLPF_64X64 &&
+               !clpf_all_skip(cm, mi_col, mi_row, 64 / MI_SIZE)) {
+      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+    } else if (cm->clpf_size == CLPF_32X32) {
+      const int tr = tl + 1;
+      const int bl = tl + cm->clpf_stride;
+      const int br = tr + cm->clpf_stride;
+      const int size = 32 / MI_SIZE;
+
+      // Up to four bits per SB
+      if (!clpf_all_skip(cm, mi_col, mi_row, size))
+        cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+
+      if (mi_col + size < cm->mi_cols &&
+          !clpf_all_skip(cm, mi_col + size, mi_row, size))
+        cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR);
+
+      if (mi_row + size < cm->mi_rows &&
+          !clpf_all_skip(cm, mi_col, mi_row + size, size))
+        cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR);
+
+      if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows &&
+          !clpf_all_skip(cm, mi_col + size, mi_row + size, size))
+        cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR);
+    }
+  }
+#endif
+
 #if CONFIG_DERING
   if (bsize == BLOCK_64X64) {
     if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
@@ -765,20 +819,26 @@
 }
 
 #if CONFIG_CLPF
-static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static void setup_clpf(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int width = pbi->cur_buf->buf.y_crop_width;
+  const int height = pbi->cur_buf->buf.y_crop_height;
+
   cm->clpf_blocks = 0;
   cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
   cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
   cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
   if (cm->clpf_strength_y) {
     cm->clpf_size = aom_rb_read_literal(rb, 2);
-    if (cm->clpf_size) {
-      int i;
-      cm->clpf_numblocks = aom_rb_read_literal(rb, av1_clpf_maxbits(cm));
-      CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(cm->clpf_numblocks));
-      for (i = 0; i < cm->clpf_numblocks; i++) {
-        cm->clpf_blocks[i] = aom_rb_read_literal(rb, 1);
-      }
+    if (cm->clpf_size != CLPF_NOSIZE) {
+      int size;
+      cm->clpf_stride =
+          ((width + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >> MIN_FB_SIZE_LOG2;
+      size =
+          cm->clpf_stride * ((height + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >>
+          MIN_FB_SIZE_LOG2;
+      CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
+      memset(cm->clpf_blocks, -1, size);
     }
   }
 }
@@ -788,7 +848,7 @@
                     UNUSED const YV12_BUFFER_CONFIG *org,
                     UNUSED const AV1_COMMON *cm, UNUSED int block_size,
                     UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
-                    UNUSED unsigned int fb_size_log2, uint8_t *bit) {
+                    UNUSED unsigned int fb_size_log2, int8_t *bit) {
   return *bit;
 }
 #endif
@@ -1897,7 +1957,7 @@
 
   setup_loopfilter(&cm->lf, rb);
 #if CONFIG_CLPF
-  setup_clpf(cm, rb);
+  setup_clpf(pbi, rb);
 #endif
 #if CONFIG_DERING
   setup_dering(cm, rb);
@@ -2329,18 +2389,18 @@
   if (!cm->skip_loop_filter) {
     const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
     if (cm->clpf_strength_y) {
-      av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
+      av1_clpf_frame(frame, NULL, cm, cm->clpf_size != CLPF_NOSIZE,
                      cm->clpf_strength_y + (cm->clpf_strength_y == 3),
-                     4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
+                     4 + cm->clpf_size, AOM_PLANE_Y, clpf_bit);
     }
     if (cm->clpf_strength_u) {
-      av1_clpf_frame(frame, NULL, cm, 0,
-                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
+      av1_clpf_frame(frame, NULL, cm, 0,  // No block signals for chroma
+                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4,
                      AOM_PLANE_U, NULL);
     }
     if (cm->clpf_strength_v) {
-      av1_clpf_frame(frame, NULL, cm, 0,
-                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
+      av1_clpf_frame(frame, NULL, cm, 0,  // No block signals for chroma
+                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4,
                      AOM_PLANE_V, NULL);
     }
   }

diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 06d55d9..110a39a 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c

@@ -1192,6 +1192,37 @@
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 
+#if CONFIG_CLPF
+  if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
+      cm->clpf_size != CLPF_NOSIZE) {
+    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
+                   mi_col * MI_SIZE / MIN_FB_SIZE;
+    const int tr = tl + 1;
+    const int bl = tl + cm->clpf_stride;
+    const int br = tr + cm->clpf_stride;
+
+    // Up to four bits per SB.
+    // When clpf_size indicates a size larger than the SB size
+    // (CLPF_128X128), one bit for every fourth SB will be transmitted
+    // regardless of skip blocks.
+    if (cm->clpf_blocks[tl] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[tl], 1);
+
+    if (mi_col + MI_SIZE / 2 < cm->mi_cols &&
+        cm->clpf_blocks[tr] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[tr], 1);
+
+    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
+        cm->clpf_blocks[bl] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[bl], 1);
+
+    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
+        mi_col + MI_SIZE / 2 < cm->mi_cols &&
+        cm->clpf_blocks[br] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[br], 1);
+  }
+#endif
+
 #if CONFIG_DERING
   if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
       !sb_all_skip(cm, mi_row, mi_col)) {
@@ -1462,18 +1493,6 @@
   aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
   if (cm->clpf_strength_y) {
     aom_wb_write_literal(wb, cm->clpf_size, 2);
-    if (cm->clpf_size) {
-      int i;
-      // TODO(stemidts): The number of bits to transmit could be
-      // implicitly deduced if transmitted after the filter block or
-      // after the frame (when it's known whether the block is all
-      // skip and implicitly unfiltered).  And the bits do not have
-      // 50% probability, so a more efficient coding is possible.
-      aom_wb_write_literal(wb, cm->clpf_numblocks, av1_clpf_maxbits(cm));
-      for (i = 0; i < cm->clpf_numblocks; i++) {
-        aom_wb_write_literal(wb, cm->clpf_blocks ? cm->clpf_blocks[i] : 0, 1);
-      }
-    }
   }
 }
 #endif

diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 1d498f1..4e652b6 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c

@@ -127,14 +127,15 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, uint8_t *res) {
+                      unsigned int fb_size_log2, int8_t *res) {
   int m, n, sum0 = 0, sum1 = 0;
 
   for (m = 0; m < h; m++) {
     for (n = 0; n < w; n++) {
       int xpos = (l << fb_size_log2) + n * block_size;
       int ypos = (k << fb_size_log2) + m * block_size;
-      if (!cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
+      if (fb_size_log2 == MAX_FB_SIZE_LOG2 ||
+          !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
                ->mbmi.skip) {
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
@@ -167,6 +168,8 @@
 // (Only for luma:)
 // res[1][0]   : (bit count, fb size = 128)
 // res[1][1-3] : strength=1,2,4, fb size = 128
+// res[1][4]   : unfiltered, including skip
+// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128
 // res[2][0]   : (bit count, fb size = 64)
 // res[2][1-3] : strength=1,2,4, fb size = 64
 // res[3][0]   : (bit count, fb size = 32)
@@ -174,9 +177,9 @@
 static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                     unsigned int block_size, unsigned int fb_size_log2, int w,
-                    int h, int64_t res[4][4], int plane) {
+                    int h, int64_t res[4][8], int plane) {
   int c, m, n, filtered = 0;
-  int sum[4];
+  int sum[8];
   const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
   const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
   int bslog = get_msb(block_size);
@@ -193,12 +196,12 @@
       plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
   int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
-  sum[0] = sum[1] = sum[2] = sum[3] = 0;
+  sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
   if (plane == AOM_PLANE_Y &&
       fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
     int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
 
-    fb_size_log2--;
+    filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2;
     w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
     h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
     w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
@@ -210,8 +213,8 @@
     oldfiltered = res[i][0];
     res[i][0] = 0;
 
-    filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
-                        res, plane);
+    filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+                         res, plane);
     if (1 << (fb_size_log2 - bslog) < w)
       filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
                            fb_size_log2, w2, h1, res, plane);
@@ -223,10 +226,18 @@
                    cm, block_size, fb_size_log2, w2, h2, res, plane);
     }
 
+    // Correct sums for unfiltered blocks
     res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
     res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]);
     res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]);
+    if (i == 1) {
+      res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]);
+      res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]);
+      res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]);
+    }
+
     res[i][0] = oldfiltered + filtered;  // Number of signal bits
+
     return filtered;
   }
 
@@ -234,27 +245,28 @@
     for (n = 0; n < w; n++) {
       int xpos = x + n * block_size;
       int ypos = y + m * block_size;
-      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
-                               (xpos << subx) / MI_SIZE]
-               ->mbmi.skip) {
+      int skip =  // Filtered skip blocks stored only for fb_size == 128
+          4 *
+          !!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                                (xpos << subx) / MI_SIZE]
+                ->mbmi.skip;
 #if CONFIG_AOM_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          aom_clpf_detect_multi_hbd(
-              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
-              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
-              cm->bit_depth - 8, block_size);
-        } else {
-          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                                xpos, ypos, rec_width, rec_height, sum,
-                                block_size);
-        }
-#else
+      if (cm->use_highbitdepth) {
+        aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
+                                  CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
+                                  org_stride, xpos, ypos, rec_width, rec_height,
+                                  sum + skip, cm->bit_depth - 8, block_size);
+      } else {
         aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                              xpos, ypos, rec_width, rec_height, sum,
+                              xpos, ypos, rec_width, rec_height, sum + skip,
                               block_size);
-#endif
-        filtered = 1;
       }
+#else
+      aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                            xpos, ypos, rec_width, rec_height, sum + skip,
+                            block_size);
+#endif
+      filtered |= !skip;
     }
   }
 
@@ -263,6 +275,12 @@
     res[c][1] += sum[1];
     res[c][2] += sum[2];
     res[c][3] += sum[3];
+    if (c != 1) continue;
+    // Only needed when fb_size == 128
+    res[c][4] += sum[4];
+    res[c][5] += sum[5];
+    res[c][6] += sum[6];
+    res[c][7] += sum[7];
   }
   return filtered;
 }
@@ -271,7 +289,7 @@
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                          int *best_strength, int *best_bs, int plane) {
   int c, j, k, l;
-  int64_t best, sums[4][4];
+  int64_t best, sums[4][8];
   int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
   int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   const int bs = MI_SIZE;
@@ -303,8 +321,14 @@
       }
     }
 
-  if (plane != AOM_PLANE_Y)  // Slightly favour unfiltered chroma
+  // For fb_size == 128 skip blocks are included in the result.
+  if (plane == AOM_PLANE_Y) {
+    sums[1][1] += sums[1][5] - sums[1][4];
+    sums[1][2] += sums[1][6] - sums[1][4];
+    sums[1][3] += sums[1][7] - sums[1][4];
+  } else {  // Slightly favour unfiltered chroma
     sums[0][0] -= sums[0][0] >> 7;
+  }
 
   for (j = 0; j < 4; j++) {
     static const double lambda_square[] = {

diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index bb85fbc..586eed0 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h

@@ -17,7 +17,7 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, uint8_t *res);
+                      unsigned int fb_size_log2, int8_t *res);
 
 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,

diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 0cb3e50..b4e22ea 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c

@@ -2621,12 +2621,23 @@
 
 #if CONFIG_CLPF
   cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
-  cm->clpf_size = 2;
-  CHECK_MEM_ERROR(
-      cm, cm->clpf_blocks,
-      aom_malloc(((cm->frame_to_show->y_crop_width + 31) & ~31) *
-                     ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
-                 10));
+  cm->clpf_size = CLPF_64X64;
+
+  // Allocate buffer to hold the status of all filter blocks:
+  // 1 = On, 0 = off, -1 = implicitly off
+  {
+    int size;
+    cm->clpf_stride = ((cm->frame_to_show->y_crop_width + MIN_FB_SIZE - 1) &
+                       ~(MIN_FB_SIZE - 1)) >>
+                      MIN_FB_SIZE_LOG2;
+    size = cm->clpf_stride *
+               ((cm->frame_to_show->y_crop_height + MIN_FB_SIZE - 1) &
+                ~(MIN_FB_SIZE - 1)) >>
+           MIN_FB_SIZE_LOG2;
+    CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
+    memset(cm->clpf_blocks, CLPF_NOFLAG, size);
+  }
+
   if (!is_lossless_requested(&cpi->oxcf)) {
     const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
 
@@ -2641,20 +2652,18 @@
       // Apply the filter using the chosen strength
       cm->clpf_strength_y = strength_y - (strength_y == 4);
       cm->clpf_size =
-          fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
-      cm->clpf_numblocks = av1_clpf_frame(
-          frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
-          4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
+          fb_size_log2 ? fb_size_log2 - MAX_FB_SIZE_LOG2 + 3 : CLPF_NOSIZE;
+      av1_clpf_frame(frame, cpi->Source, cm, cm->clpf_size != CLPF_NOSIZE,
+                     strength_y, 4 + cm->clpf_size, AOM_PLANE_Y,
+                     av1_clpf_decision);
     }
     if (strength_u) {
       cm->clpf_strength_u = strength_u - (strength_u == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
-                     NULL);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, AOM_PLANE_U, NULL);
     }
     if (strength_v) {
       cm->clpf_strength_v = strength_v - (strength_v == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
-                     NULL);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, AOM_PLANE_V, NULL);
     }
   }
 #endif
commit	85437b211a79804eb668d97aa54375c9ed64f209	[log] [tgz]
author	Steinar Midtskogen <stemidts@cisco.com>	Wed Sep 21 13:38:16 2016 +0200
committer	Steinar Midtskogen <stemidts@cisco.com>	Tue Oct 04 12:55:15 2016 +0000
tree	4acf5b309e8086c855c48d12392e869da05e0412
parent	f041c0ff9a2478acd66056e2f1140fa3bb2f9e3f [diff]