Avoid calculating dering direction and variance more than once

Gives an identical bitstream and about 2.5% faster encoding with
cpu-used=4.

Change-Id: Ic7faf2224e51de01bb16af050bb29540862d54ac
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index e2f5b42..71ce202 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -155,6 +155,7 @@
   unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
   int dering_count;
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+  int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
   int bsize[3];
   int dec[3];
@@ -380,7 +381,7 @@
         if (threshold == 0 && clpf_strength == 0) continue;
         od_dering(dst,
                   &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
-                  dec[pli], dir, pli, dlist, dering_count, threshold,
+                  dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold,
                   clpf_strength, clpf_damping, coeff_shift);
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index c7061c4..849c9b1 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -240,7 +240,8 @@
 }
 
 void od_dering(uint16_t *y, uint16_t *in, int xdec,
-               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
+               int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                dering_list *dlist, int dering_count, int threshold,
                int clpf_strength, int clpf_damping, int coeff_shift) {
   int bi;
@@ -252,12 +253,18 @@
   };
   bsize = OD_DERING_SIZE_LOG2 - xdec;
   if (pli == 0) {
+    if (!dirinit || !*dirinit) {
+      for (bi = 0; bi < dering_count; bi++) {
+        by = dlist[bi].by;
+        bx = dlist[bi].bx;
+        dir[by][bx] = od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
+                                   OD_FILT_BSTRIDE, &var[by][bx], coeff_shift);
+      }
+      if (dirinit) *dirinit = 1;
+    }
     for (bi = 0; bi < dering_count; bi++) {
-      int32_t var;
       by = dlist[bi].by;
       bx = dlist[bi].bx;
-      dir[by][bx] = od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
-                                 OD_FILT_BSTRIDE, &var, coeff_shift);
       /* Deringing orthogonal to the direction uses a tighter threshold
          because we want to be conservative. We've presumably already
          achieved some deringing, so the amount of change is expected
@@ -269,7 +276,7 @@
       (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
           &y[bi << 2 * bsize], 1 << bsize,
           &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
-          od_adjust_thresh(threshold, var), dir[by][bx]);
+          od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 62f0773..d5224f4 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -47,7 +47,8 @@
                                 int bsize);
 
 void od_dering(uint16_t *y, uint16_t *in, int xdec,
-               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
+               int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                dering_list *dlist, int dering_count, int threshold,
                int clpf_strength, int clpf_damping, int coeff_shift);
 int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 96320d3..1655a65 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -100,6 +100,7 @@
   uint16_t *ref_coeff[3];
   dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+  int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride[3];
   int bsize[3];
   int dec[3];
@@ -179,6 +180,7 @@
     for (sbc = 0; sbc < nhsb; sbc++) {
       int nvb, nhb;
       int gi;
+      int dirinit = 0;
       DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
       DECLARE_ALIGNED(32, uint16_t,
                       tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
@@ -225,9 +227,10 @@
             }
           }
           clpf_strength = gi % CLPF_STRENGTHS;
-          od_dering(tmp_dst, in, dec[pli], dir, pli, dlist, dering_count,
-                    threshold, clpf_strength + (clpf_strength == 3),
-                    clpf_damping, coeff_shift);
+          od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
+                    dering_count, threshold,
+                    clpf_strength + (clpf_strength == 3), clpf_damping,
+                    coeff_shift);
           copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[pli], tmp_dst,
                                      dlist, dering_count, bsize[pli]);
           mse[pli][sb_count][gi] = (int)compute_dist(