Avoid calculating dering direction and variance more than once
Gives an identical bitstream and about 2.5% faster encoding with
--cpu-used=4.
Change-Id: Ic7faf2224e51de01bb16af050bb29540862d54ac
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index e2f5b42..71ce202 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -155,6 +155,7 @@
unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
int dering_count;
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
int stride;
int bsize[3];
int dec[3];
@@ -380,7 +381,7 @@
if (threshold == 0 && clpf_strength == 0) continue;
od_dering(dst,
&src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
- dec[pli], dir, pli, dlist, dering_count, threshold,
+ dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold,
clpf_strength, clpf_damping, coeff_shift);
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index c7061c4..849c9b1 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -240,7 +240,8 @@
}
void od_dering(uint16_t *y, uint16_t *in, int xdec,
- int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+ int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
dering_list *dlist, int dering_count, int threshold,
int clpf_strength, int clpf_damping, int coeff_shift) {
int bi;
@@ -252,12 +253,18 @@
};
bsize = OD_DERING_SIZE_LOG2 - xdec;
if (pli == 0) {
+ if (!dirinit || !*dirinit) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
+ OD_FILT_BSTRIDE, &var[by][bx], coeff_shift);
+ }
+ if (dirinit) *dirinit = 1;
+ }
for (bi = 0; bi < dering_count; bi++) {
- int32_t var;
by = dlist[bi].by;
bx = dlist[bi].bx;
- dir[by][bx] = od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
- OD_FILT_BSTRIDE, &var, coeff_shift);
/* Deringing orthogonal to the direction uses a tighter threshold
because we want to be conservative. We've presumably already
achieved some deringing, so the amount of change is expected
@@ -269,7 +276,7 @@
(filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[bi << 2 * bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
- od_adjust_thresh(threshold, var), dir[by][bx]);
+ od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
}
} else {
for (bi = 0; bi < dering_count; bi++) {
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 62f0773..d5224f4 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -47,7 +47,8 @@
int bsize);
void od_dering(uint16_t *y, uint16_t *in, int xdec,
- int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+ int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
dering_list *dlist, int dering_count, int threshold,
int clpf_strength, int clpf_damping, int coeff_shift);
int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 96320d3..1655a65 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -100,6 +100,7 @@
uint16_t *ref_coeff[3];
dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+ int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
int stride[3];
int bsize[3];
int dec[3];
@@ -179,6 +180,7 @@
for (sbc = 0; sbc < nhsb; sbc++) {
int nvb, nhb;
int gi;
+ int dirinit = 0;
DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
DECLARE_ALIGNED(32, uint16_t,
tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
@@ -225,9 +227,10 @@
}
}
clpf_strength = gi % CLPF_STRENGTHS;
- od_dering(tmp_dst, in, dec[pli], dir, pli, dlist, dering_count,
- threshold, clpf_strength + (clpf_strength == 3),
- clpf_damping, coeff_shift);
+ od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
+ dering_count, threshold,
+ clpf_strength + (clpf_strength == 3), clpf_damping,
+ coeff_shift);
copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[pli], tmp_dst,
dlist, dering_count, bsize[pli]);
mse[pli][sb_count][gi] = (int)compute_dist(