Properly initialize intra boundary in tpl model

The tpl model processes blocks in raster order. As a result, some
blocks read uninitialized pixels from the bottom-left during intra
prediction. This commit explicitly extends the bottom-left boundary
for all blocks to prepare for the intra prediction. Coding
performance at speed 1, VBR, 150 frames improves slightly, by 0.05%,
on lowres and ugc360p.

STATS_CHANGED

Change-Id: Ibe4b8c7256b4b8b4e8e712d052afbc8840a556af
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 6f342c7..f551ce8 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -181,6 +181,28 @@
 
   // Intra prediction search
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+  // Pre-load the bottom left line.
+  if (xd->left_available &&
+      mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+      for (int i = 0; i < bw; ++i)
+        dst[(bw + i) * dst_buffer_stride - 1] =
+            dst[(bw - 1) * dst_buffer_stride - 1];
+    } else {
+      for (int i = 0; i < bw; ++i)
+        dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+            dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+    }
+#else
+    for (int i = 0; i < bw; ++i)
+      dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+          dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+#endif
+  }
+
   for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
     uint8_t *src;
     uint8_t *dst;