De-sparsifying the deringing output buffer

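Store the filtered blocks contiguously in the deringing output buffer,
indexed by their position in the bskip list (block bi starts at offset
bi << 2*bsize, with a row stride of 1 << bsize), instead of at their
(by, bx) position inside the superblock. The ystride/sstride parameters
of od_dering(), copy_blocks_16bit() and copy_blocks_16_8bit() are
removed accordingly.

No change in output.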

Change-Id: I940203975564aedca8734d6f74b013edb513f517
diff --git a/av1/common/dering.c b/av1/common/dering.c
index 908c588..afdaa69 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -90,7 +90,7 @@
 }
 
 /* TODO: Optimize this function for SSE. */
-void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src,
     unsigned char (*bskip)[2], int dering_count, int bsize)
 {
   int bi, bx, by;
@@ -100,7 +100,7 @@
       bx = bskip[bi][1];
       copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
                      dstride,
-                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
@@ -108,7 +108,7 @@
       bx = bskip[bi][1];
       copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
                      dstride,
-                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   }
 }
@@ -182,7 +182,7 @@
         else
           threshold = level << coeff_shift;
         if (threshold == 0) continue;
-        od_dering(dst, MAX_MIB_SIZE * bsize[pli],
+        od_dering(dst,
                   &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
                             sbc * bsize[pli] * MAX_MIB_SIZE],
                   stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
@@ -194,7 +194,7 @@
                   xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
                   (bsize[pli] * MAX_MIB_SIZE * sbr) +
                   sbc * bsize[pli] * MAX_MIB_SIZE],
-              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+              xd->plane[pli].dst.stride, dst, bskip,
               dering_count, 3 - dec[pli]);
         } else {
 #endif
@@ -202,7 +202,7 @@
               &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
                                     (bsize[pli] * MAX_MIB_SIZE * sbr) +
                                     sbc * bsize[pli] * MAX_MIB_SIZE],
-              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+              xd->plane[pli].dst.stride, dst, bskip,
               dering_count, 3 - dec[pli]);
 #if CONFIG_AOM_HIGHBITDEPTH
         }
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 2fc06ea..f86335d 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -277,7 +277,7 @@
 }
 
 /* TODO: Optimize this function for SSE. */
-void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
     unsigned char (*bskip)[2], int dering_count, int bsize)
 {
   int bi, bx, by;
@@ -287,7 +287,7 @@
       bx = bskip[bi][1];
       copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
                      dstride,
-                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
@@ -295,12 +295,12 @@
       bx = bskip[bi][1];
       copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
                      dstride,
-                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+                     &src[bi << 2*bsize], 1 << bsize);
     }
   }
 }
 
-void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+void od_dering(int16_t *y, const od_dering_in *x, int xstride,
                int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char (*bskip)[2], int dering_count, int threshold,
@@ -349,7 +349,7 @@
          since the ringing there tends to be directional, so it doesn't
          get removed by the directional filtering. */
       filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &y[bi << 2*bsize], 1 << bsize,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
           od_adjust_thresh(threshold, var), dir[by][bx]);
     }
@@ -358,19 +358,19 @@
       by = bskip[bi][0];
       bx = bskip[bi][1];
       filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &y[bi << 2*bsize], 1 << bsize,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
           dir[by][bx]);
     }
   }
-  copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count,
+  copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, bskip, dering_count,
       bsize);
   for (bi = 0; bi < dering_count; bi++) {
     by = bskip[bi][0];
     bx = bskip[bi][1];
     if (filter2_thresh[by][bx] == 0) continue;
     (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
-        &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+        &y[bi << 2*bsize], 1 << bsize,
         &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
         dir[by][bx]);
   }
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 97090e5..5ec9027 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -36,10 +36,10 @@
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
-void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
     unsigned char (*bskip)[2], int dering_count, int bsize);
 
-void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+void od_dering(int16_t *y, const od_dering_in *x, int xstride,
                int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char (*bskip)[2], int skip_stride, int threshold,
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 0c79e45..a503dd9 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -98,6 +98,7 @@
       int best_gi;
       int32_t best_mse = INT32_MAX;
       int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
+      int16_t tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
       if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
@@ -115,13 +116,15 @@
                     sbc * bsize[0] * MAX_MIB_SIZE + c];
           }
         }
-        od_dering(dst, MAX_MIB_SIZE * bsize[0],
+        od_dering(tmp_dst,
                   &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
                        sbc * bsize[0] * MAX_MIB_SIZE],
                   cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
                   dir, 0,
                   bskip,
                   dering_count, threshold, coeff_shift);
+        copy_blocks_16bit(dst, MAX_MIB_SIZE * bsize[0], tmp_dst, bskip,
+            dering_count, 3);
         cur_mse = (int)compute_dist(
             dst, MAX_MIB_SIZE * bsize[0],
             &ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +