Merge "Adding 64x64 forward and inverse transforms" into nextgenv2

commit: deef66db013f83920b911bb7d5d642eaaf1d12c5 [log] [tgz]
author: Debargha Mukherjee <debargha@google.com> Wed Nov 02 18:40:54 2016 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Wed Nov 02 18:40:55 2016 +0000
tree: e7eb6631d1f1de6a78bc883e3635e21482496456
parent: 1af3d51685d6dbf206dc150309fc5cf047899174 [diff]
parent: 67d134772cfdf217d02eefc627deb3a5f297d1be [diff]
diff --git a/av1/common/dering.c b/av1/common/dering.c
index 4519031..908c588 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c

@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+// clang-format off
+
 #include <string.h>
 #include <math.h>
 
@@ -45,19 +47,84 @@
   return skip;
 }
 
+/* Scans the mode-info cells of the region whose top-left cell is
+   (mi_row, mi_col). The region is clipped to the frame (cm->mi_rows,
+   cm->mi_cols) and to at most MAX_MIB_SIZE cells in each direction.
+
+   For every cell whose mbmi.skip flag is NOT set, the cell's offset relative
+   to (mi_row, mi_col) is appended to bskip: bskip[i][0] is the row offset and
+   bskip[i][1] is the column offset. Note that, despite its name, bskip
+   therefore lists the non-skipped cells.
+
+   *count_ptr receives the number of entries written to bskip. At most
+   MAX_MIB_SIZE * MAX_MIB_SIZE entries are written, so the caller's array must
+   be at least that large.
+
+   Returns 1 if every scanned cell had skip set (nothing was written), and 0
+   otherwise. */
+int sb_all_skip_out(const AV1_COMMON *const cm, int mi_row, int mi_col,
+    unsigned char (*bskip)[2], int *count_ptr) {
+  int r, c;
+  int maxc, maxr;
+  int skip = 1;
+  MODE_INFO **grid;
+  int count=0;
+  grid = cm->mi_grid_visible;
+  /* Number of cells left in the frame to the right of / below the origin. */
+  maxc = cm->mi_cols - mi_col;
+  maxr = cm->mi_rows - mi_row;
+  /* Never look at more than MAX_MIB_SIZE cells in either direction. */
+  if (maxr > MAX_MIB_SIZE) maxr = MAX_MIB_SIZE;
+  if (maxc > MAX_MIB_SIZE) maxc = MAX_MIB_SIZE;
+  for (r = 0; r < maxr; r++) {
+    MODE_INFO **grid_row;
+    grid_row = &grid[(mi_row + r) * cm->mi_stride + mi_col];
+    for (c = 0; c < maxc; c++) {
+      if (!grid_row[c]->mbmi.skip) {
+        /* Record the (row, column) offset of this non-skipped cell. */
+        skip = 0;
+        bskip[count][0] = r;
+        bskip[count][1] = c;
+        count++;
+      }
+    }
+  }
+  *count_ptr = count;
+  return skip;
+}
+
+/* Copies an 8x8 block of int16_t samples from src (row stride sstride, in
+   elements) into the uint8_t buffer dst (row stride dstride, in elements).
+   Each sample is narrowed to uint8_t by plain assignment; no clamping is
+   performed here. */
+static INLINE void copy_8x8_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride) {
+  int i, j;
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* Copies a 4x4 block of int16_t samples from src (row stride sstride, in
+   elements) into the uint8_t buffer dst (row stride dstride, in elements).
+   Each sample is narrowed to uint8_t by plain assignment; no clamping is
+   performed here. */
+static INLINE void copy_4x4_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride) {
+  int i, j;
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* Copies a list of blocks from the int16_t buffer src to the uint8_t buffer
+   dst, leaving every other part of dst untouched.
+
+   bskip holds dering_count entries; for entry i, bskip[i][0] is the block row
+   and bskip[i][1] is the block column. Each listed block is copied from the
+   same block position in src to dst.
+
+   bsize selects the block size: 3 means 8x8 blocks (block coordinates are
+   scaled by 8); any other value is treated as 4x4 blocks (block coordinates
+   are scaled by 4). dstride and sstride are row strides in elements.
+
+   TODO: Optimize this function for SSE. */
+void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
+    unsigned char (*bskip)[2], int dering_count, int bsize)
+{
+  int bi, bx, by;
+  if (bsize == 3) {
+    /* 8x8 blocks: pixel offset = block index << 3. */
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
+                     dstride,
+                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+    }
+  } else {
+    /* 4x4 blocks: pixel offset = block index << 2. */
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
+                     dstride,
+                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+    }
+  }
+}
+
 void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                       MACROBLOCKD *xd, int global_level) {
   int r, c;
   int sbr, sbc;
   int nhsb, nvsb;
   od_dering_in *src[3];
-  unsigned char *bskip;
+  unsigned char bskip[MAX_MIB_SIZE*MAX_MIB_SIZE][2];
+  int dering_count;
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
-  int bsize_x[3];
-  int bsize_y[3];
-  int dec_x[3];
-  int dec_y[3];
+  int bsize[3];
+  int dec[3];
   int pli;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   int nplanes;
@@ -68,19 +135,16 @@
     nplanes = 1;
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
-  for (pli = 0; pli < nplanes; pli++) {
-    dec_x[pli] = xd->plane[pli].subsampling_x;
-    dec_y[pli] = xd->plane[pli].subsampling_y;
-    bsize_x[pli] = 8 >> dec_x[pli];
-    bsize_y[pli] = 8 >> dec_y[pli];
+  for (pli = 0; pli < 3; pli++) {
+    dec[pli] = xd->plane[pli].subsampling_x;
+    bsize[pli] = 8 >> dec[pli];
   }
-  stride = bsize_x[0] * cm->mi_cols;
-  for (pli = 0; pli < nplanes; pli++) {
+  stride = bsize[0] * cm->mi_cols;
+  for (pli = 0; pli < 3; pli++) {
     src[pli] = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
-    for (r = 0; r < bsize_y[pli] * cm->mi_rows; ++r) {
-      for (c = 0; c < bsize_x[pli] * cm->mi_cols; ++c) {
+    for (r = 0; r < bsize[pli] * cm->mi_rows; ++r) {
+      for (c = 0; c < bsize[pli] * cm->mi_cols; ++c) {
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           src[pli][r * stride + c] = CONVERT_TO_SHORTPTR(
@@ -95,13 +159,6 @@
       }
     }
   }
-  for (r = 0; r < cm->mi_rows; ++r) {
-    for (c = 0; c < cm->mi_cols; ++c) {
-      const MB_MODE_INFO *mbmi =
-          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
-      bskip[r * cm->mi_cols + c] = mbmi->skip;
-    }
-  }
   for (sbr = 0; sbr < nvsb; sbr++) {
     for (sbc = 0; sbc < nhsb; sbc++) {
       int level;
@@ -112,7 +169,8 @@
           global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
                                             MAX_MIB_SIZE * sbc]
                             ->mbmi.dering_gain);
-      if (level == 0 || sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE))
+      if (level == 0 ||
+          sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
         continue;
       for (pli = 0; pli < nplanes; pli++) {
         int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
@@ -124,39 +182,35 @@
         else
           threshold = level << coeff_shift;
         if (threshold == 0) continue;
-        od_dering(dst, MAX_MIB_SIZE * bsize_x[pli],
-                  &src[pli][sbr * stride * bsize_x[pli] * MAX_MIB_SIZE +
-                            sbc * bsize_x[pli] * MAX_MIB_SIZE],
-                  stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec_x[pli],
-                  dec_y[pli], dir, pli,
-                  &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
-                  cm->mi_cols, threshold, coeff_shift);
-        for (r = 0; r < bsize_y[pli] * nvb; ++r) {
-          for (c = 0; c < bsize_x[pli] * nhb; ++c) {
+        od_dering(dst, MAX_MIB_SIZE * bsize[pli],
+                  &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
+                            sbc * bsize[pli] * MAX_MIB_SIZE],
+                  stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
+                  bskip, dering_count, threshold, coeff_shift);
 #if CONFIG_AOM_HIGHBITDEPTH
-            if (cm->use_highbitdepth) {
-              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
-              [xd->plane[pli].dst.stride *
-                   (bsize_x[pli] * MAX_MIB_SIZE * sbr + r) +
-               sbc * bsize_x[pli] * MAX_MIB_SIZE + c] =
-                  dst[r * MAX_MIB_SIZE * bsize_x[pli] + c];
-            } else {
+        if (cm->use_highbitdepth) {
+          copy_blocks_16bit(
+              (int16_t*)&CONVERT_TO_SHORTPTR(
+                  xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
+                  (bsize[pli] * MAX_MIB_SIZE * sbr) +
+                  sbc * bsize[pli] * MAX_MIB_SIZE],
+              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+              dering_count, 3 - dec[pli]);
+        } else {
 #endif
-              xd->plane[pli]
-                  .dst.buf[xd->plane[pli].dst.stride *
-                               (bsize_x[pli] * MAX_MIB_SIZE * sbr + r) +
-                           sbc * bsize_x[pli] * MAX_MIB_SIZE + c] =
-                  dst[r * MAX_MIB_SIZE * bsize_x[pli] + c];
+          copy_blocks_16_8bit(
+              &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
+                                    (bsize[pli] * MAX_MIB_SIZE * sbr) +
+                                    sbc * bsize[pli] * MAX_MIB_SIZE],
+              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+              dering_count, 3 - dec[pli]);
 #if CONFIG_AOM_HIGHBITDEPTH
-            }
-#endif
-          }
         }
+#endif
       }
     }
   }
   for (pli = 0; pli < nplanes; pli++) {
     aom_free(src[pli]);
   }
-  aom_free(bskip);
 }

diff --git a/av1/common/dering.h b/av1/common/dering.h
index 7c93f8b..c906994 100644
--- a/av1/common/dering.h
+++ b/av1/common/dering.h

@@ -11,6 +11,8 @@
 #ifndef AV1_COMMON_DERING_H_
 #define AV1_COMMON_DERING_H_
 
+// clang-format off
+
 #include "av1/common/od_dering.h"
 #include "av1/common/onyxc_int.h"
 #include "aom/aom_integer.h"
@@ -29,6 +31,8 @@
 
 int compute_level_from_index(int global_level, int gi);
 int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
+int sb_all_skip_out(const AV1_COMMON *const cm, int mi_row, int mi_col,
+    unsigned char (*bskip)[2], int *count_ptr);
 void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                       MACROBLOCKD *xd, int global_level);
 

diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 7ed49a4..f19291c 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c

@@ -183,19 +183,6 @@
   return (total_abs + 2) >> 2;
 }
 
-int od_filter_dering_direction_4x8(int16_t *y, int ystride, const int16_t *in,
-                                     int threshold, int dir) {
-  return od_filter_dering_direction_4x4(y, ystride, in, threshold, dir)
-  + od_filter_dering_direction_4x4(y + 4*ystride, ystride,
-                                   in + 4*OD_FILT_BSTRIDE, threshold, dir);
-}
-
-int od_filter_dering_direction_8x4(int16_t *y, int ystride, const int16_t *in,
-                                     int threshold, int dir) {
-  return od_filter_dering_direction_4x4(y, ystride, in, threshold, dir)
-  + od_filter_dering_direction_4x4(y + 4, ystride, in + 4, threshold, dir);
-}
-
 /* Smooth in the direction orthogonal to what was detected. */
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
@@ -254,21 +241,6 @@
   }
 }
 
-void od_filter_dering_orthogonal_4x8(int16_t *y, int ystride,
-                                       const int16_t *in, int threshold,
-                                       int dir) {
-  od_filter_dering_orthogonal_4x4(y, ystride, in, threshold, dir);
-  od_filter_dering_orthogonal_4x4(y + 4*ystride, ystride,
-                                  in + 4*OD_FILT_BSTRIDE, threshold, dir);
-}
-
-void od_filter_dering_orthogonal_8x4(int16_t *y, int ystride,
-                                       const int16_t *in, int threshold,
-                                       int dir) {
-  od_filter_dering_orthogonal_4x4(y, ystride, in, threshold, dir);
-  od_filter_dering_orthogonal_4x4(y + 4, ystride, in + 4, threshold, dir);
-}
-
 /* This table approximates x^0.16 with the index being log2(x). It is clamped
    to [.5, 3]. The table is computed as:
    round(256*min(3, max(.5, 1.08*(sqrt(2)*2.^([0:17]+8)/256/256).^.16))) */
@@ -290,92 +262,116 @@
   return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
 }
 
+/* Copies an 8x8 block of int16_t samples from src (row stride sstride, in
+   elements) to dst (row stride dstride, in elements). */
+static INLINE void copy_8x8_16bit(int16_t *dst, int dstride, int16_t *src, int sstride) {
+  int i, j;
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* Copies a 4x4 block of int16_t samples from src (row stride sstride, in
+   elements) to dst (row stride dstride, in elements). */
+static INLINE void copy_4x4_16bit(int16_t *dst, int dstride, int16_t *src, int sstride) {
+  int i, j;
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* Copies a list of blocks from the int16_t buffer src to the int16_t buffer
+   dst, leaving every other part of dst untouched.
+
+   bskip holds dering_count entries; for entry i, bskip[i][0] is the block row
+   and bskip[i][1] is the block column. Each listed block is copied from the
+   same block position in src to dst.
+
+   bsize selects the block size: 3 means 8x8 blocks (block coordinates are
+   scaled by 8); any other value is treated as 4x4 blocks (block coordinates
+   are scaled by 4). dstride and sstride are row strides in elements.
+
+   TODO: Optimize this function for SSE. */
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+    unsigned char (*bskip)[2], int dering_count, int bsize)
+{
+  int bi, bx, by;
+  if (bsize == 3) {
+    /* 8x8 blocks: pixel offset = block index << 3. */
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
+                     dstride,
+                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+    }
+  } else {
+    /* 4x4 blocks: pixel offset = block index << 2. */
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
+                     dstride,
+                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+    }
+  }
+}
+
 void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
                int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
-               int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
-               unsigned char *bskip, int skip_stride, int threshold,
+               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+               unsigned char (*bskip)[2], int dering_count, int threshold,
                int coeff_shift) {
   int i;
   int j;
+  int bi;
   int bx;
   int by;
   int16_t inbuf[OD_DERING_INBUF_SIZE];
   int16_t *in;
-  int bsize_x = 3 - xdec;
-  int bsize_y = 3 - ydec;
+  int bsize;
   int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int filter2_thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
-    od_filter_dering_direction_8x8, od_filter_dering_direction_8x4,
-    od_filter_dering_direction_4x8, od_filter_dering_direction_4x4
+    od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
   };
   od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
-    od_filter_dering_orthogonal_8x8, od_filter_dering_orthogonal_8x4,
-    od_filter_dering_orthogonal_4x8, od_filter_dering_orthogonal_4x4
+    od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
   };
-  int filter_idx = xdec*2 + ydec;
+  bsize = 3 - xdec;
   in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
   /* We avoid filtering the pixels for which some of the pixels to average
      are outside the frame. We could change the filter instead, but it would
      add special cases for any future vectorization. */
   for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
   for (i = -OD_FILT_BORDER * (sby != 0);
-       i < (nvb << bsize_y) + OD_FILT_BORDER * (sby != nvsb - 1); i++) {
+       i < (nvb << bsize) + OD_FILT_BORDER * (sby != nvsb - 1); i++) {
     for (j = -OD_FILT_BORDER * (sbx != 0);
-         j < (nhb << bsize_x) + OD_FILT_BORDER * (sbx != nhsb - 1); j++) {
+         j < (nhb << bsize) + OD_FILT_BORDER * (sbx != nhsb - 1); j++) {
       in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
     }
   }
-  /* Assume deringing filter is sparsely applied, so do one large copy rather
-     than small copies later if deringing is skipped. */
-  for (i = 0; i < nvb << bsize_y; i++) {
-    for (j = 0; j < nhb << bsize_x; j++) {
-      y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
-    }
-  }
   if (pli == 0) {
-    for (by = 0; by < nvb; by++) {
-      for (bx = 0; bx < nhb; bx++) {
-        if (bskip[by * skip_stride + bx]) continue;
-        dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
-                                   &var[by][bx], coeff_shift);
-        /* Deringing orthogonal to the direction uses a tighter threshold
-           because we want to be conservative. We've presumably already
-           achieved some deringing, so the amount of change is expected
-           to be low. Also, since we might be filtering across an edge, we
-           want to make sure not to blur it. That being said, we might want
-           to be a little bit more aggressive on pure horizontal/vertical
-           since the ringing there tends to be directional, so it doesn't
-           get removed by the directional filtering. */
-        filter2_thresh[by][bx] = (filter_dering_direction[filter_idx])(
-            &y[(by * ystride << bsize_y) + (bx << bsize_x)], ystride,
-            &in[(by * OD_FILT_BSTRIDE << bsize_y) + (bx << bsize_x)],
-            od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
-      }
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
+                                 &var[by][bx], coeff_shift);
+      /* Deringing orthogonal to the direction uses a tighter threshold
+         because we want to be conservative. We've presumably already
+         achieved some deringing, so the amount of change is expected
+         to be low. Also, since we might be filtering across an edge, we
+         want to make sure not to blur it. That being said, we might want
+         to be a little bit more aggressive on pure horizontal/vertical
+         since the ringing there tends to be directional, so it doesn't
+         get removed by the directional filtering. */
+      filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+          od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
     }
   } else {
-    for (by = 0; by < nvb; by++) {
-      for (bx = 0; bx < nhb; bx++) {
-        if (bskip[by * skip_stride + bx]) continue;
-        filter2_thresh[by][bx] = (filter_dering_direction[filter_idx])(
-            &y[(by * ystride << bsize_y) + (bx << bsize_x)], ystride,
-            &in[(by * OD_FILT_BSTRIDE << bsize_y) + (bx << bsize_x)], threshold,
-            dir[by][bx]);
-      }
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
+          dir[by][bx]);
     }
   }
-  for (i = 0; i < nvb << bsize_y; i++) {
-    for (j = 0; j < nhb << bsize_x; j++) {
-      in[i * OD_FILT_BSTRIDE + j] = y[i * ystride + j];
-    }
-  }
-  for (by = 0; by < nvb; by++) {
-    for (bx = 0; bx < nhb; bx++) {
-      if (bskip[by * skip_stride + bx] || filter2_thresh[by][bx] == 0) continue;
-      (filter_dering_orthogonal[filter_idx])(
-          &y[(by * ystride << bsize_y) + (bx << bsize_x)], ystride,
-          &in[(by * OD_FILT_BSTRIDE << bsize_y) + (bx << bsize_x)],
-          filter2_thresh[by][bx], dir[by][bx]);
-    }
+  copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count,
+      bsize);
+  for (bi = 0; bi < dering_count; bi++) {
+    by = bskip[bi][0];
+    bx = bskip[bi][1];
+    if (filter2_thresh[by][bx] == 0) continue;
+    (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+        &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+        &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
+        dir[by][bx]);
   }
 }

diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index fc3a3ef..97090e5 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h

@@ -12,6 +12,8 @@
 #if !defined(_dering_H)
 #define _dering_H (1)
 
+// clang-format off
+
 #include "odintrin.h"
 
 #if defined(DAALA_ODINTRIN)
@@ -19,7 +21,7 @@
 typedef int16_t od_dering_in;
 #endif
 
-#define OD_DERINGSIZES (4)
+#define OD_DERINGSIZES (2)
 
 #define OD_DERING_NBLOCKS (OD_BSIZE_MAX / 8)
 
@@ -34,26 +36,21 @@
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+    unsigned char (*bskip)[2], int dering_count, int bsize);
+
 void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
                int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
-               int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
-               unsigned char *bskip, int skip_stride, int threshold,
+               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+               unsigned char (*bskip)[2], int skip_stride, int threshold,
                int coeff_shift);
 int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
-int od_filter_dering_direction_4x8(int16_t *y, int ystride, const int16_t *in,
-                                   int threshold, int dir);
-int od_filter_dering_direction_8x4(int16_t *y, int ystride, const int16_t *in,
-                                   int threshold, int dir);
 int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
 void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);
-void od_filter_dering_orthogonal_4x8(int16_t *y, int ystride, const int16_t *in,
-                                     int threshold, int dir);
-void od_filter_dering_orthogonal_8x4(int16_t *y, int ystride, const int16_t *in,
-                                     int threshold, int dir);
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);

diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 6f7767a..0c79e45 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c

@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+// clang-format off
+
 #include <string.h>
 #include <math.h>
 
@@ -41,7 +43,7 @@
   int nhsb, nvsb;
   od_dering_in *src;
   int16_t *ref_coeff;
-  unsigned char *bskip;
+  unsigned char bskip[MAX_MIB_SIZE*MAX_MIB_SIZE][2];
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
   int bsize[3];
@@ -49,10 +51,10 @@
   int pli;
   int level;
   int best_level;
+  int dering_count;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
   ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
-  bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
   for (pli = 0; pli < 3; pli++) {
     dec[pli] = xd->plane[pli].subsampling_x;
@@ -77,13 +79,6 @@
 #endif
     }
   }
-  for (r = 0; r < cm->mi_rows; ++r) {
-    for (c = 0; c < cm->mi_cols; ++c) {
-      const MB_MODE_INFO *mbmi =
-          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
-      bskip[r * cm->mi_cols + c] = mbmi->skip;
-    }
-  }
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   /* Pick a base threshold based on the quantizer. The threshold will then be
@@ -105,20 +100,28 @@
       int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
-      if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+      if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
+        continue;
       best_gi = 0;
       for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
         int cur_mse;
         int threshold;
         level = compute_level_from_index(best_level, gi);
         threshold = level << coeff_shift;
+        for (r = 0; r < bsize[0] * nvb; r++) {
+          for (c = 0; c < bsize[0] * nhb; c++) {
+            dst[r * MAX_MIB_SIZE * bsize[0] + c] =
+                src[(sbr * bsize[0] * MAX_MIB_SIZE + r) * stride +
+                    sbc * bsize[0] * MAX_MIB_SIZE + c];
+          }
+        }
         od_dering(dst, MAX_MIB_SIZE * bsize[0],
                   &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
                        sbc * bsize[0] * MAX_MIB_SIZE],
-                  cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, 0,
+                  cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
                   dir, 0,
-                  &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
-                  cm->mi_cols, threshold, coeff_shift);
+                  bskip,
+                  dering_count, threshold, coeff_shift);
         cur_mse = (int)compute_dist(
             dst, MAX_MIB_SIZE * bsize[0],
             &ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +
@@ -136,6 +139,5 @@
   }
   aom_free(src);
   aom_free(ref_coeff);
-  aom_free(bskip);
   return best_level;
 }
commit	deef66db013f83920b911bb7d5d642eaaf1d12c5	[log] [tgz]
author	Debargha Mukherjee <debargha@google.com>	Wed Nov 02 18:40:54 2016 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Wed Nov 02 18:40:55 2016 +0000
tree	e7eb6631d1f1de6a78bc883e3635e21482496456
parent	1af3d51685d6dbf206dc150309fc5cf047899174 [diff]
parent	67d134772cfdf217d02eefc627deb3a5f297d1be [diff]