Remove DCT from od_compute_dist_8x8

Cherry-pick Daala e248823a:
 Getting rid of the DCT in od_compute_dist_8x8()
 Replacing the DCT and frequency weighting with a filter

Change-Id: Icc3a46e5dbb561e4e3b00fa6c2290d54299c05cb
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 83f7c2d..725743c 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -472,12 +472,15 @@
   return (s2 - (sum * sum >> 4)) >> 4;
 }
 
+/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
+   the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
+   is applied both horizontally and vertically. For X=5, the filter is
+   a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
+#define OD_DIST_LP_MID (5)
+#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
+
 static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x,
-                                  od_coeff *y, int stride) {
-  od_coeff e[8 * 8];
-  od_coeff et[8 * 8];
-  int16_t src[8 * 8];
-  tran_low_t coeff[8 * 8];
+                                  od_coeff *y, od_coeff *e_lp, int stride) {
   double sum;
   int min_var;
   double mean_var;
@@ -487,7 +490,6 @@
   int i;
   int j;
   double vardist;
-  FWD_TXFM_PARAM fwd_txfm_param;
 
   vardist = 0;
   OD_ASSERT(qm != OD_FLAT_QM);
@@ -526,27 +528,11 @@
   sum = 0;
   for (i = 0; i < 8; i++) {
     for (j = 0; j < 8; j++)
-      e[8 * i + j] = x[i * stride + j] - y[i * stride + j];
+      sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
   }
-
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++) src[8 * i + j] = e[8 * i + j];
-
-  fwd_txfm_param.tx_type = DCT_DCT;
-  fwd_txfm_param.tx_size = TX_8X8;
-  fwd_txfm_param.lossless = 0;
-
-  fwd_txfm(&src[0], &coeff[0], 8, &fwd_txfm_param);
-
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++) et[8 * i + j] = coeff[8 * i + j] >> 3;
-
-  sum = 0;
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++)
-      sum += et[8 * i + j] * (double)et[8 * i + j] * 16. /
-             OD_QM8_Q4_HVS[i * 8 + j];
-
+  /* Normalize the filter to unit DC response. */
+  sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
+               OD_DIST_LP_NORM);
   return activity * activity * (sum + vardist);
 }
 
@@ -569,11 +555,43 @@
       sum += tmp * tmp;
     }
   } else {
+    int j;
+    DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
+    DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+    DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+    int mid = OD_DIST_LP_MID;
+    for (i = 0; i < bsize_h; i++) {
+      for (j = 0; j < bsize_w; j++) {
+        e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
+      }
+    }
+    for (i = 0; i < bsize_h; i++) {
+      tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+      tmp[i * bsize_w + bsize_w - 1] =
+          mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+      for (j = 1; j < bsize_w - 1; j++) {
+        tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
+                               e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
+      }
+    }
+    for (j = 0; j < bsize_w; j++) {
+      e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
+      e_lp[(bsize_h - 1) * bsize_w + j] =
+          mid * tmp[(bsize_h - 1) * bsize_w + j] +
+          2 * tmp[(bsize_h - 2) * bsize_w + j];
+    }
+    for (i = 1; i < bsize_h - 1; i++) {
+      for (j = 0; j < bsize_w; j++) {
+        e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
+                                tmp[(i - 1) * bsize_w + j] +
+                                tmp[(i + 1) * bsize_w + j];
+      }
+    }
     for (i = 0; i < bsize_h; i += 8) {
-      int j;
       for (j = 0; j < bsize_w; j += 8) {
         sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j],
-                                   &y[i * bsize_w + j], bsize_w);
+                                   &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
+                                   bsize_w);
       }
     }
     /* Compensate for the fact that the quantization matrix lowers the