Let od_dering() handle 16 to 8 bit conversion

Change-Id: Ief5df3d5b1b81f09190d34022a3cb7d500992da2
diff --git a/av1/common/cdef.c b/av1/common/cdef.c index 891c4bd..6faccdc 100644 --- a/av1/common/cdef.c +++ b/av1/common/cdef.c
@@ -78,44 +78,6 @@ return count; } -static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride, - uint16_t *src, int sstride) { - int i, j; - for (i = 0; i < 8; i++) - for (j = 0; j < 8; j++) - dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; -} - -static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride, - uint16_t *src, int sstride) { - int i, j; - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) - dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; -} - -/* TODO: Optimize this function for SSE. */ -void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src, - dering_list *dlist, int dering_count, - int bsize) { - int bi, bx, by; - if (bsize == 3) { - for (bi = 0; bi < dering_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride, - &src[bi << 2 * bsize], 1 << bsize); - } - } else { - for (bi = 0; bi < dering_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; - copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride, - &src[bi << 2 * bsize], 1 << bsize); - } - } -} - /* TODO: Optimize this function for SSE. 
*/ static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, @@ -384,27 +346,28 @@ threshold = level << coeff_shift; if (threshold == 0 && clpf_strength == 0) continue; - od_dering(dst, - &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER], - dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold, - clpf_strength, clpf_damping, coeff_shift, 0); #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { - copy_dering_16bit_to_16bit( - &CONVERT_TO_SHORTPTR( - xd->plane[pli] - .dst.buf)[xd->plane[pli].dst.stride * - (MAX_MIB_SIZE * sbr << bsize[pli]) + - (sbc * MAX_MIB_SIZE << bsize[pli])], - xd->plane[pli].dst.stride, dst, dlist, dering_count, - 3 - dec[pli]); + od_dering((uint8_t *)&CONVERT_TO_SHORTPTR( + xd->plane[pli] + .dst.buf)[xd->plane[pli].dst.stride * + (MAX_MIB_SIZE * sbr << bsize[pli]) + + (sbc * MAX_MIB_SIZE << bsize[pli])], + xd->plane[pli].dst.stride, dst, + &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER], + dec[pli], dir, NULL, var, pli, dlist, dering_count, + threshold, clpf_strength, clpf_damping, coeff_shift, 0, 1); } else { #endif - copy_dering_16bit_to_8bit( + od_dering( &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride * (MAX_MIB_SIZE * sbr << bsize[pli]) + (sbc * MAX_MIB_SIZE << bsize[pli])], - xd->plane[pli].dst.stride, dst, dlist, dering_count, bsize[pli]); + xd->plane[pli].dst.stride, dst, + &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER], + dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold, + clpf_strength, clpf_damping, coeff_shift, 0, 0); + #if CONFIG_AOM_HIGHBITDEPTH } #endif
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c index 65ea4f2..6729676 100644 --- a/av1/common/od_dering.c +++ b/av1/common/od_dering.c
@@ -239,12 +239,50 @@ } } -void od_dering(uint16_t *y, uint16_t *in, int xdec, +static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride, + uint16_t *src, int sstride) { + int i, j; + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; +} + +static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride, + uint16_t *src, int sstride) { + int i, j; + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) + dst[i * dstride + j] = (uint8_t)src[i * sstride + j]; +} + +/* TODO: Optimize this function for SSE. */ +static void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src, + dering_list *dlist, int dering_count, + int bsize) { + int bi, bx, by; + if (bsize == 3) { + for (bi = 0; bi < dering_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride, + &src[bi << 2 * bsize], 1 << bsize); + } + } else { + for (bi = 0; bi < dering_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride, + &src[bi << 2 * bsize], 1 << bsize); + } + } +} + +void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit, int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, dering_list *dlist, int dering_count, int threshold, int clpf_strength, int clpf_damping, int coeff_shift, - int skip_dering) { + int skip_dering, int hbd) { int bi; int bx; int by; @@ -292,18 +330,27 @@ } } } - if (!clpf_strength) return; - if (threshold && !skip_dering) - copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count, - bsize); - for (bi = 0; bi < dering_count; bi++) { - by = dlist[bi].by; - bx = dlist[bi].bx; + if (clpf_strength) { + if (threshold && !skip_dering) + copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count, + bsize); + for (bi = 0; bi < dering_count; bi++) { + by = 
dlist[bi].by; + bx = dlist[bi].bx; - (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd - : aom_clpf_hblock_hbd)( - in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE, - 1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize, - clpf_strength << coeff_shift, clpf_damping + coeff_shift); + (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd + : aom_clpf_hblock_hbd)( + in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE, + 1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize, + clpf_strength << coeff_shift, clpf_damping + coeff_shift); + } + } + if (dst) { + if (hbd) { + copy_dering_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist, + dering_count, 3 - xdec); + } else { + copy_dering_16bit_to_8bit(dst, dstride, y, dlist, dering_count, bsize); + } } }
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h index a3efec2..9247785 100644 --- a/av1/common/od_dering.h +++ b/av1/common/od_dering.h
@@ -46,12 +46,12 @@ dering_list *dlist, int dering_count, int bsize); -void od_dering(uint16_t *y, uint16_t *in, int xdec, +void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit, int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, dering_list *dlist, int dering_count, int threshold, int clpf_strength, int clpf_damping, int coeff_shift, - int skip_dering); + int skip_dering, int hbd); int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir);
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c index bcb82b7..f2099fd 100644 --- a/av1/encoder/pickcdef.c +++ b/av1/encoder/pickcdef.c
@@ -263,14 +263,11 @@ src[pli], (sbr * MAX_MIB_SIZE << bsize[pli]) - yoff, (sbc * MAX_MIB_SIZE << bsize[pli]) - xoff, stride[pli], ysize, xsize); - od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist, + od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE, + tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist, dering_count, threshold, clpf_strength + (clpf_strength == 3), clpf_damping, - coeff_shift, clpf_strength != 0); - if (clpf_strength == 0) { - copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, tmp_dst, dlist, - dering_count, bsize[pli]); - } + coeff_shift, clpf_strength != 0, 1); mse[pli][sb_count][gi] = compute_dering_mse( ref_coeff[pli] + (sbr * MAX_MIB_SIZE << bsize[pli]) * stride[pli] +