Let od_dering() handle the 16-bit to 8-bit conversion

Change-Id: Ief5df3d5b1b81f09190d34022a3cb7d500992da2
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 891c4bd..6faccdc 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -78,44 +78,6 @@
   return count;
 }
 
-static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride,
-                                          uint16_t *src, int sstride) {
-  int i, j;
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++)
-      dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
-}
-
-static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride,
-                                          uint16_t *src, int sstride) {
-  int i, j;
-  for (i = 0; i < 4; i++)
-    for (j = 0; j < 4; j++)
-      dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
-}
-
-/* TODO: Optimize this function for SSE. */
-void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src,
-                               dering_list *dlist, int dering_count,
-                               int bsize) {
-  int bi, bx, by;
-  if (bsize == 3) {
-    for (bi = 0; bi < dering_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
-                             &src[bi << 2 * bsize], 1 << bsize);
-    }
-  } else {
-    for (bi = 0; bi < dering_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
-                             &src[bi << 2 * bsize], 1 << bsize);
-    }
-  }
-}
-
 /* TODO: Optimize this function for SSE. */
 static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
                         const uint8_t *src, int src_voffset, int src_hoffset,
@@ -384,27 +346,28 @@
 
         threshold = level << coeff_shift;
         if (threshold == 0 && clpf_strength == 0) continue;
-        od_dering(dst,
-                  &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
-                  dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold,
-                  clpf_strength, clpf_damping, coeff_shift, 0);
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          copy_dering_16bit_to_16bit(
-              &CONVERT_TO_SHORTPTR(
-                  xd->plane[pli]
-                      .dst.buf)[xd->plane[pli].dst.stride *
-                                    (MAX_MIB_SIZE * sbr << bsize[pli]) +
-                                (sbc * MAX_MIB_SIZE << bsize[pli])],
-              xd->plane[pli].dst.stride, dst, dlist, dering_count,
-              3 - dec[pli]);
+          od_dering((uint8_t *)&CONVERT_TO_SHORTPTR(
+                        xd->plane[pli]
+                            .dst.buf)[xd->plane[pli].dst.stride *
+                                          (MAX_MIB_SIZE * sbr << bsize[pli]) +
+                                      (sbc * MAX_MIB_SIZE << bsize[pli])],
+                    xd->plane[pli].dst.stride, dst,
+                    &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
+                    dec[pli], dir, NULL, var, pli, dlist, dering_count,
+                    threshold, clpf_strength, clpf_damping, coeff_shift, 0, 1);
         } else {
 #endif
-          copy_dering_16bit_to_8bit(
+          od_dering(
               &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
                                           (MAX_MIB_SIZE * sbr << bsize[pli]) +
                                       (sbc * MAX_MIB_SIZE << bsize[pli])],
-              xd->plane[pli].dst.stride, dst, dlist, dering_count, bsize[pli]);
+              xd->plane[pli].dst.stride, dst,
+              &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
+              dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold,
+              clpf_strength, clpf_damping, coeff_shift, 0, 0);
+
 #if CONFIG_AOM_HIGHBITDEPTH
         }
 #endif
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 65ea4f2..6729676 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -239,12 +239,50 @@
   }
 }
 
-void od_dering(uint16_t *y, uint16_t *in, int xdec,
+static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride,
+                                          uint16_t *src, int sstride) {
+  int i, j; /* Copy an 8x8 block, narrowing each 16-bit sample to 8 bits. */
+  for (i = 0; i < 8; i++) /* i: row; strides are in elements */
+    for (j = 0; j < 8; j++) /* j: column */
+      dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; /* no clamp */
+}
+
+static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride,
+                                          uint16_t *src, int sstride) {
+  int i, j; /* Copy a 4x4 block, narrowing each 16-bit sample to 8 bits. */
+  for (i = 0; i < 4; i++) /* i: row; strides are in elements */
+    for (j = 0; j < 4; j++) /* j: column */
+      dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; /* no clamp */
+}
+
+/* Copies the dlist blocks of src to dst as 8 bit. TODO: Optimize for SSE. */
+static void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src,
+                                      dering_list *dlist, int dering_count,
+                                      int bsize) {
+  int bi, bx, by; /* bi: index into dlist; (by, bx): that entry's block */
+  if (bsize == 3) { /* 8x8 blocks */
+    for (bi = 0; bi < dering_count; bi++) { /* src blocks are packed */
+      by = dlist[bi].by; /* block row; sample row in dst is by << 3 */
+      bx = dlist[bi].bx; /* block column; sample column is bx << 3 */
+      copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+                             &src[bi << 2 * bsize], 1 << bsize);
+    }
+  } else { /* every other bsize is copied as 4x4 blocks */
+    for (bi = 0; bi < dering_count; bi++) { /* src blocks are packed */
+      by = dlist[bi].by; /* block row; sample row in dst is by << 2 */
+      bx = dlist[bi].bx; /* block column; sample column is bx << 2 */
+      copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+                             &src[bi << 2 * bsize], 1 << bsize);
+    }
+  }
+}
+
+void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
                int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                dering_list *dlist, int dering_count, int threshold,
                int clpf_strength, int clpf_damping, int coeff_shift,
-               int skip_dering) {
+               int skip_dering, int hbd) {
   int bi;
   int bx;
   int by;
@@ -292,18 +330,27 @@
       }
     }
   }
-  if (!clpf_strength) return;
-  if (threshold && !skip_dering)
-    copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count,
-                               bsize);
-  for (bi = 0; bi < dering_count; bi++) {
-    by = dlist[bi].by;
-    bx = dlist[bi].bx;
+  if (clpf_strength) {
+    if (threshold && !skip_dering)
+      copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count,
+                                 bsize);
+    for (bi = 0; bi < dering_count; bi++) {
+      by = dlist[bi].by;
+      bx = dlist[bi].bx;
 
-    (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
-                                                    : aom_clpf_hblock_hbd)(
-        in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE,
-        1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize,
-        clpf_strength << coeff_shift, clpf_damping + coeff_shift);
+      (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
+                                                      : aom_clpf_hblock_hbd)(
+          in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE,
+          1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize,
+          clpf_strength << coeff_shift, clpf_damping + coeff_shift);
+    }
+  }
+  if (dst) {
+    if (hbd) {
+      copy_dering_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist,
+                                 dering_count, 3 - xdec);
+    } else {
+      copy_dering_16bit_to_8bit(dst, dstride, y, dlist, dering_count, bsize);
+    }
   }
 }
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index a3efec2..9247785 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -46,12 +46,12 @@
                                 dering_list *dlist, int dering_count,
                                 int bsize);
 
-void od_dering(uint16_t *y, uint16_t *in, int xdec,
+void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
                int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                dering_list *dlist, int dering_count, int threshold,
                int clpf_strength, int clpf_damping, int coeff_shift,
-               int skip_dering);
+               int skip_dering, int hbd);
 int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
                                      const uint16_t *in, int threshold,
                                      int dir);
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index bcb82b7..f2099fd 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -263,14 +263,11 @@
                          src[pli], (sbr * MAX_MIB_SIZE << bsize[pli]) - yoff,
                          (sbc * MAX_MIB_SIZE << bsize[pli]) - xoff, stride[pli],
                          ysize, xsize);
-          od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
+          od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE,
+                    tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
                     dering_count, threshold,
                     clpf_strength + (clpf_strength == 3), clpf_damping,
-                    coeff_shift, clpf_strength != 0);
-          if (clpf_strength == 0) {
-            copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, tmp_dst, dlist,
-                                       dering_count, bsize[pli]);
-          }
+                    coeff_shift, clpf_strength != 0, 1);
           mse[pli][sb_count][gi] = compute_dering_mse(
               ref_coeff[pli] +
                   (sbr * MAX_MIB_SIZE << bsize[pli]) * stride[pli] +