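De-sparsify the deringing output buffer
Filtered blocks are now stored contiguously in the output buffer, indexed by their position in the bskip list (block bi starts at offset bi << 2*bsize with a row stride of 1 << bsize), instead of at their spatial position in a superblock-sized buffer. The ystride/sstride parameters are therefore removed from od_dering(), copy_blocks_16bit() and copy_blocks_16_8bit().
No change in output.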
Change-Id: I940203975564aedca8734d6f74b013edb513f517
diff --git a/av1/common/dering.c b/av1/common/dering.c
index 908c588..afdaa69 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -90,7 +90,7 @@
}
/* TODO: Optimize this function for SSE. */
-void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src,
unsigned char (*bskip)[2], int dering_count, int bsize)
{
int bi, bx, by;
@@ -100,7 +100,7 @@
bx = bskip[bi][1];
copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
dstride,
- &src[(by << 3) * sstride + (bx << 3)], sstride);
+ &src[bi << 2*bsize], 1 << bsize);
}
} else {
for (bi = 0; bi < dering_count; bi++) {
@@ -108,7 +108,7 @@
bx = bskip[bi][1];
copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
dstride,
- &src[(by << 2) * sstride + (bx << 2)], sstride);
+ &src[bi << 2*bsize], 1 << bsize);
}
}
}
@@ -182,7 +182,7 @@
else
threshold = level << coeff_shift;
if (threshold == 0) continue;
- od_dering(dst, MAX_MIB_SIZE * bsize[pli],
+ od_dering(dst,
&src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
sbc * bsize[pli] * MAX_MIB_SIZE],
stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
@@ -194,7 +194,7 @@
xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
(bsize[pli] * MAX_MIB_SIZE * sbr) +
sbc * bsize[pli] * MAX_MIB_SIZE],
- xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+ xd->plane[pli].dst.stride, dst, bskip,
dering_count, 3 - dec[pli]);
} else {
#endif
@@ -202,7 +202,7 @@
&xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
(bsize[pli] * MAX_MIB_SIZE * sbr) +
sbc * bsize[pli] * MAX_MIB_SIZE],
- xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+ xd->plane[pli].dst.stride, dst, bskip,
dering_count, 3 - dec[pli]);
#if CONFIG_AOM_HIGHBITDEPTH
}
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 2fc06ea..f86335d 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -277,7 +277,7 @@
}
/* TODO: Optimize this function for SSE. */
-void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
unsigned char (*bskip)[2], int dering_count, int bsize)
{
int bi, bx, by;
@@ -287,7 +287,7 @@
bx = bskip[bi][1];
copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
dstride,
- &src[(by << 3) * sstride + (bx << 3)], sstride);
+ &src[bi << 2*bsize], 1 << bsize);
}
} else {
for (bi = 0; bi < dering_count; bi++) {
@@ -295,12 +295,12 @@
bx = bskip[bi][1];
copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
dstride,
- &src[(by << 2) * sstride + (bx << 2)], sstride);
+ &src[bi << 2*bsize], 1 << bsize);
}
}
}
-void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+void od_dering(int16_t *y, const od_dering_in *x, int xstride,
int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
unsigned char (*bskip)[2], int dering_count, int threshold,
@@ -349,7 +349,7 @@
since the ringing there tends to be directional, so it doesn't
get removed by the directional filtering. */
filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
- &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &y[bi << 2*bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
od_adjust_thresh(threshold, var), dir[by][bx]);
}
@@ -358,19 +358,19 @@
by = bskip[bi][0];
bx = bskip[bi][1];
filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
- &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &y[bi << 2*bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
dir[by][bx]);
}
}
- copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count,
+ copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, bskip, dering_count,
bsize);
for (bi = 0; bi < dering_count; bi++) {
by = bskip[bi][0];
bx = bskip[bi][1];
if (filter2_thresh[by][bx] == 0) continue;
(filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
- &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &y[bi << 2*bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
dir[by][bx]);
}
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 97090e5..5ec9027 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -36,10 +36,10 @@
typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
const int16_t *in,
int threshold, int dir);
-void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
unsigned char (*bskip)[2], int dering_count, int bsize);
-void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+void od_dering(int16_t *y, const od_dering_in *x, int xstride,
int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
unsigned char (*bskip)[2], int skip_stride, int threshold,
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 0c79e45..a503dd9 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -98,6 +98,7 @@
int best_gi;
int32_t best_mse = INT32_MAX;
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
+ int16_t tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
@@ -115,13 +116,15 @@
sbc * bsize[0] * MAX_MIB_SIZE + c];
}
}
- od_dering(dst, MAX_MIB_SIZE * bsize[0],
+ od_dering(tmp_dst,
&src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
sbc * bsize[0] * MAX_MIB_SIZE],
cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
dir, 0,
bskip,
dering_count, threshold, coeff_shift);
+ copy_blocks_16bit(dst, MAX_MIB_SIZE * bsize[0], tmp_dst, bskip,
+ dering_count, 3);
cur_mse = (int)compute_dist(
dst, MAX_MIB_SIZE * bsize[0],
&ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +