[ext-intra] fix bug in sample interpolation when dy < 0

When the projected left-column position "y" (computed from "dy") falls
between -1 and 0, i.e. base2 = y >> 8 is -1, we should interpolate
between the upper-left (-1,-1) and left (-1,0) samples instead of just
taking the left sample.

Coding gain improves a bit with this change.

BUG=aomedia:357

Change-Id: If520f71b22115e79120499c21f8d6925c1fa24be
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 7cf30eb..d23a81a 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -758,18 +758,13 @@
 #endif  // CONFIG_INTRA_INTERP
       } else {
         base2 = y >> 8;
-        if (base2 >= 0) {
-          shift2 = y & 0xFF;
+        shift2 = y & 0xFF;
 #if CONFIG_INTRA_INTERP
-          val =
-              intra_subpel_interp(base2, shift2, left, 0, bs - 1, filter_type);
+        val = intra_subpel_interp(base2, shift2, left, -1, bs - 1, filter_type);
 #else
-          val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2;
-          val = ROUND_POWER_OF_TWO(val, 8);
+        val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2;
+        val = ROUND_POWER_OF_TWO(val, 8);
 #endif  // CONFIG_INTRA_INTERP
-        } else {
-          val = left[0];
-        }
       }
       dst[c] = clip_pixel(val);
     }
@@ -1053,18 +1048,14 @@
         x = c + 1;
         y = (r << 8) - x * dy;
         base = y >> 8;
-        if (base >= 0) {
-          shift = y & 0xFF;
+        shift = y & 0xFF;
 #if CONFIG_INTRA_INTERP
-          val = highbd_intra_subpel_interp(base, shift, left, 0, bs - 1,
-                                           filter_type);
+        val = highbd_intra_subpel_interp(base, shift, left, -1, bs - 1,
+                                         filter_type);
 #else
-          val = left[base] * (256 - shift) + left[base + 1] * shift;
-          val = ROUND_POWER_OF_TWO(val, 8);
+        val = left[base] * (256 - shift) + left[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 8);
 #endif  // CONFIG_INTRA_INTERP
-        } else {
-          val = left[0];
-        }
       }
       dst[c] = clip_pixel_highbd(val, bd);
     }
@@ -1559,9 +1550,10 @@
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  DECLARE_ALIGNED(16, uint16_t, left_col[MAX_TX_SIZE * 2]);
+  DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 16]);
   DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 16]);
   uint16_t *above_row = above_data + 16;
+  uint16_t *left_col = left_data + 16;
   const uint16_t *const_above_row = above_row;
   const int bs = tx_size_wide[tx_size];
   int need_left = extend_modes[mode] & NEED_LEFT;
@@ -1683,6 +1675,7 @@
   if (need_above_left) {
     above_row[-1] =
         n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
+    left_col[-1] = above_row[-1];
   }
 
 #if CONFIG_FILTER_INTRA
@@ -1727,10 +1720,11 @@
                                    int n_left_px, int n_bottomleft_px,
                                    int plane) {
   int i;
-  DECLARE_ALIGNED(16, uint8_t, left_col[MAX_TX_SIZE * 2]);
   const uint8_t *above_ref = ref - ref_stride;
+  DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 16]);
   DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 16]);
   uint8_t *above_row = above_data + 16;
+  uint8_t *left_col = left_data + 16;
   const uint8_t *const_above_row = above_row;
   const int bs = tx_size_wide[tx_size];
   int need_left = extend_modes[mode] & NEED_LEFT;
@@ -1850,6 +1844,7 @@
 
   if (need_above_left) {
     above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
+    left_col[-1] = above_row[-1];
   }
 
 #if CONFIG_FILTER_INTRA