Optimize Wiener filter selection

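* Change the behaviour of search_wiener at borders to match
  the behaviour of the Wiener filter itself: extend the frame
  by a WIENER_HALFWIN-pixel border and gather statistics over
  the full frame/tile area instead of excluding a
  WIENER_HALFWIN-wide margin (luma and chroma searches)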
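* In compute_stats (and its high-bitdepth counterpart), accumulate
  only the upper triangle of the symmetric matrix H inside the
  pixel loops and mirror it into the lower triangle once
  afterwards, saving ~5% of encode time at low bitrates (tested
  on bus_cif.y4m at 200kbps)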

Change-Id: I5f649d77fd66584451aaf37697ce9c9af69524e4
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 1aa0ecf..0fa07c8 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -654,13 +654,19 @@
         M[k] += Y[k] * X;
         H[k * WIENER_WIN2 + k] += Y[k] * Y[k];
         for (l = k + 1; l < WIENER_WIN2; ++l) {
-          double value = Y[k] * Y[l];
-          H[k * WIENER_WIN2 + l] += value;
-          H[l * WIENER_WIN2 + k] += value;
+          // H is a symmetric matrix, so we only need to fill out the upper
+          // triangle here. We can copy it down to the lower triangle outside
+          // the (i, j) loops.
+          H[k * WIENER_WIN2 + l] += Y[k] * Y[l];
         }
       }
     }
   }
+  for (k = 0; k < WIENER_WIN2; ++k) {
+    for (l = k + 1; l < WIENER_WIN2; ++l) {
+      H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l];
+    }
+  }
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -702,13 +708,19 @@
         M[k] += Y[k] * X;
         H[k * WIENER_WIN2 + k] += Y[k] * Y[k];
         for (l = k + 1; l < WIENER_WIN2; ++l) {
-          double value = Y[k] * Y[l];
-          H[k * WIENER_WIN2 + l] += value;
-          H[l * WIENER_WIN2 + k] += value;
+          // H is a symmetric matrix, so we only need to fill out the upper
+          // triangle here. We can copy it down to the lower triangle outside
+          // the (i, j) loops.
+          H[k * WIENER_WIN2 + l] += Y[k] * Y[l];
         }
       }
     }
   }
+  for (k = 0; k < WIENER_WIN2; ++k) {
+    for (l = k + 1; l < WIENER_WIN2; ++l) {
+      H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l];
+    }
+  }
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
@@ -939,7 +951,6 @@
   const int dgd_stride = dgd->uv_stride;
   double score;
   int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
-  int h_start, h_end, v_start, v_end;
   const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, &tile_width,
                                          &tile_height, &nhtiles, &nvtiles);
 
@@ -963,30 +974,39 @@
   cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
 
   rsi[plane].frame_restoration_type = RESTORE_WIENER;
-  h_start = v_start = WIENER_HALFWIN;
-  h_end = width - WIENER_HALFWIN;
-  v_end = height - WIENER_HALFWIN;
-  if (plane == AOM_PLANE_U) {
+
 #if CONFIG_AOM_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      compute_stats_highbd(dgd->u_buffer, src->u_buffer, h_start, h_end,
-                           v_start, v_end, dgd_stride, src_stride, M, H);
-    else
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-      compute_stats(dgd->u_buffer, src->u_buffer, h_start, h_end, v_start,
-                    v_end, dgd_stride, src_stride, M, H);
-  } else if (plane == AOM_PLANE_V) {
-#if CONFIG_AOM_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      compute_stats_highbd(dgd->v_buffer, src->v_buffer, h_start, h_end,
-                           v_start, v_end, dgd_stride, src_stride, M, H);
-    else
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-      compute_stats(dgd->v_buffer, src->v_buffer, h_start, h_end, v_start,
-                    v_end, dgd_stride, src_stride, M, H);
+  if (cm->use_highbitdepth) {
+    if (plane == AOM_PLANE_U) {
+      extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->u_buffer), width, height,
+                          dgd_stride);
+      compute_stats_highbd(dgd->u_buffer, src->u_buffer, 0, width, 0, height,
+                           dgd_stride, src_stride, M, H);
+    } else if (plane == AOM_PLANE_V) {
+      extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->v_buffer), width, height,
+                          dgd_stride);
+      compute_stats_highbd(dgd->v_buffer, src->v_buffer, 0, width, 0, height,
+                           dgd_stride, src_stride, M, H);
+    } else {
+      assert(0);
+    }
   } else {
-    assert(0);
+#endif
+    if (plane == AOM_PLANE_U) {
+      extend_frame(dgd->u_buffer, width, height, dgd_stride);
+      compute_stats(dgd->u_buffer, src->u_buffer, 0, width, 0, height,
+                    dgd_stride, src_stride, M, H);
+    } else if (plane == AOM_PLANE_V) {
+      extend_frame(dgd->v_buffer, width, height, dgd_stride);
+      compute_stats(dgd->v_buffer, src->v_buffer, 0, width, 0, height,
+                    dgd_stride, src_stride, M, H);
+    } else {
+      assert(0);
+    }
+#if CONFIG_AOM_HIGHBITDEPTH
   }
+#endif
+
   if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
     info->frame_restoration_type = RESTORE_NONE;
     aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -1080,6 +1100,15 @@
   for (tile_idx = 0; tile_idx < ntiles; ++tile_idx)
     rsi->wiener_info[tile_idx].level = 0;
 
+// Construct a (WIENER_HALFWIN)-pixel border around the frame
+#if CONFIG_AOM_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->y_buffer), width, height,
+                        dgd_stride);
+  else
+#endif
+    extend_frame(dgd->y_buffer, width, height, dgd_stride);
+
   // Compute best Wiener filters for each tile
   for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
     av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
@@ -1093,9 +1122,8 @@
     best_tile_cost[tile_idx] = DBL_MAX;
 
     av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
-                             tile_height, width, height, WIENER_HALFWIN,
-                             WIENER_HALFWIN, &h_start, &h_end, &v_start,
-                             &v_end);
+                             tile_height, width, height, 0, 0, &h_start, &h_end,
+                             &v_start, &v_end);
 #if CONFIG_AOM_HIGHBITDEPTH
     if (cm->use_highbitdepth)
       compute_stats_highbd(dgd->y_buffer, src->y_buffer, h_start, h_end,