Support 64x64 intra prediction

Change-Id: I2536b5b55f28c2ee59445c3b70d3e073e69945cd
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6d230d2..c15316b 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -314,6 +314,62 @@
 add_proto qw/void aom_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
 
+if ((aom_config("CONFIG_TX64X64") eq "yes")) {  # uint8_t 64x64 intra predictors; declared only when CONFIG_TX64X64 is "yes"
+  add_proto qw/void aom_d207_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d207_predictor_64x64/;  # only the name is listed, unlike the 32x32 entry above that also lists msa neon sse2
+
+  add_proto qw/void aom_d207e_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d207e_predictor_64x64/;
+
+  add_proto qw/void aom_d45_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d45_predictor_64x64/;
+
+  add_proto qw/void aom_d45e_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d45e_predictor_64x64/;
+
+  add_proto qw/void aom_d63_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d63_predictor_64x64/;
+
+  add_proto qw/void aom_d63e_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d63e_predictor_64x64/;
+
+  add_proto qw/void aom_h_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_h_predictor_64x64/;
+
+  add_proto qw/void aom_d117_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d117_predictor_64x64/;
+
+  add_proto qw/void aom_d135_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d135_predictor_64x64/;
+
+  add_proto qw/void aom_d153_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_d153_predictor_64x64/;
+
+  add_proto qw/void aom_v_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_v_predictor_64x64/;
+
+  if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {  # exactly one of paeth / tm is declared, chosen by CONFIG_ALT_INTRA
+    add_proto qw/void aom_paeth_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+    specialize qw/aom_paeth_predictor_64x64/;
+  } else {
+    add_proto qw/void aom_tm_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+    specialize qw/aom_tm_predictor_64x64/;
+  }  # CONFIG_ALT_INTRA
+
+
+  add_proto qw/void aom_dc_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_dc_predictor_64x64/;
+
+  add_proto qw/void aom_dc_top_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_dc_top_predictor_64x64/;
+
+  add_proto qw/void aom_dc_left_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_dc_left_predictor_64x64/;
+
+  add_proto qw/void aom_dc_128_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/aom_dc_128_predictor_64x64/;
+}  # CONFIG_TX64X64
+
 # High bitdepth functions
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -524,6 +580,61 @@
 
   add_proto qw/void aom_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/aom_highbd_dc_128_predictor_32x32/;
+
+  if ((aom_config("CONFIG_TX64X64") eq "yes")) {  # uint16_t 64x64 intra predictors (extra bd argument); declared only when CONFIG_TX64X64 is "yes"
+    add_proto qw/void aom_highbd_d207_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d207_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d207e_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d207e_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d45_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d45_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d45e_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d45e_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d63_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d63_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d63e_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d63e_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_h_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_h_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d117_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d117_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d135_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d135_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_d153_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_d153_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_v_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_v_predictor_64x64/;
+
+    if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {  # exactly one of paeth / tm is declared, chosen by CONFIG_ALT_INTRA
+      add_proto qw/void aom_highbd_paeth_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+      specialize qw/aom_highbd_paeth_predictor_64x64/;  # keeps the add_proto/specialize pairing used by every other entry
+    } else {
+      add_proto qw/void aom_highbd_tm_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+      specialize qw/aom_highbd_tm_predictor_64x64/;
+    }  # CONFIG_ALT_INTRA
+
+    add_proto qw/void aom_highbd_dc_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_dc_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_dc_top_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_dc_top_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_dc_left_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_dc_left_predictor_64x64/;
+
+    add_proto qw/void aom_highbd_dc_128_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/aom_highbd_dc_128_predictor_64x64/;
+  }  # CONFIG_TX64X64
 }  # CONFIG_AOM_HIGHBITDEPTH
 
 #
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 29b5a74..1307415 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -149,6 +149,15 @@
 static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int i;
+#if CONFIG_TX64X64
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+  // silence a spurious -Warray-bounds warning, possibly related to:
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+  uint8_t border[133];
+#else
+  uint8_t border[64 + 64 - 1];  // outer border from bottom-left to top-right
+#endif
+#else
 #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
   // silence a spurious -Warray-bounds warning, possibly related to:
   // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
@@ -156,6 +165,7 @@
 #else
   uint8_t border[32 + 32 - 1];  // outer border from bottom-left to top-right
 #endif
+#endif  // CONFIG_TX64X64
 
   // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
   for (i = 0; i < bs - 2; ++i) {
@@ -965,6 +975,31 @@
   }
 
 /* clang-format off */
+#if CONFIG_TX64X64  // size lists below additionally instantiate size 64
+#define intra_pred_allsizes(type) /* intra_pred_sized for 2..64, then intra_pred_highbd_sized for 4..64 */ \
+  intra_pred_sized(type, 2) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_sized(type, 64) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32) \
+  intra_pred_highbd_sized(type, 64)
+
+#define intra_pred_above_4x4(type) /* same list minus intra_pred_sized 2 and 4; highbd 4 is kept */ \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_sized(type, 64) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32) \
+  intra_pred_highbd_sized(type, 64)
+#else  // CONFIG_TX64X64
 #define intra_pred_allsizes(type) \
   intra_pred_sized(type, 2) \
   intra_pred_sized(type, 4) \
@@ -984,8 +1019,25 @@
   intra_pred_highbd_sized(type, 8) \
   intra_pred_highbd_sized(type, 16) \
   intra_pred_highbd_sized(type, 32)
+#endif  // CONFIG_TX64X64
 
 #else
+
+#if CONFIG_TX64X64  // size lists below additionally instantiate size 64
+#define intra_pred_allsizes(type) /* intra_pred_sized for 2, 4, 8, 16, 32 and 64; no highbd entries in this variant */ \
+  intra_pred_sized(type, 2) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_sized(type, 64)
+
+#define intra_pred_above_4x4(type) /* intra_pred_sized for 8, 16, 32 and 64 only */ \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_sized(type, 64)
+#else  // CONFIG_TX64X64
 #define intra_pred_allsizes(type) \
   intra_pred_sized(type, 2) \
   intra_pred_sized(type, 4) \
@@ -997,6 +1049,7 @@
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32)
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 intra_pred_above_4x4(d207)
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 693ad80..809e9b4 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -322,10 +322,18 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 static void av1_init_intra_predictors_internal(void) {
+#if CONFIG_TX64X64  // this variant of INIT_NO_4X4 also fills the TX_64X64 slot of p
+#define INIT_NO_4X4(p, type)                  \
+  p[TX_8X8] = aom_##type##_predictor_8x8;     \
+  p[TX_16X16] = aom_##type##_predictor_16x16; \
+  p[TX_32X32] = aom_##type##_predictor_32x32; \
+  p[TX_64X64] = aom_##type##_predictor_64x64  // expands to several statements with no final ';' (presumably added at the use site); unsafe as an unbraced if/else body
+#else
 #define INIT_NO_4X4(p, type)                  \
   p[TX_8X8] = aom_##type##_predictor_8x8;     \
   p[TX_16X16] = aom_##type##_predictor_16x16; \
   p[TX_32X32] = aom_##type##_predictor_32x32
+#endif  // CONFIG_TX64X64
 
 #define INIT_ALL_SIZES(p, type)           \
   p[TX_4X4] = aom_##type##_predictor_4x4; \
@@ -910,17 +918,46 @@
       { 589, 646, -495, 255 },
       { 740, 884, -728, 77 },
   },
+#if CONFIG_TX64X64  // one more table entry, compiled only with TX64X64
+  {  // NOTE(review): the last two rows equal those of the preceding entry; presumably the whole entry is a copy of it - confirm
+      { 477, 737, -393, 150 },
+      { 881, 630, -546, 67 },
+      { 506, 984, -443, -20 },
+      { 114, 459, -270, 528 },
+      { 433, 528, 14, 3 },
+      { 837, 470, -301, -30 },
+      { 181, 777, 89, -107 },
+      { -29, 716, -232, 259 },
+      { 589, 646, -495, 255 },
+      { 740, 884, -728, 77 },
+  },
+#endif  // CONFIG_TX64X64
 };
 
+static INLINE TX_SIZE get_txsize_from_blocklen(int bs) {  // maps block side length bs to the matching square TX_SIZE
+  switch (bs) {
+    case 4: return TX_4X4;
+    case 8: return TX_8X8;
+    case 16: return TX_16X16;
+    case 32: return TX_32X32;
+#if CONFIG_TX64X64
+    case 64: return TX_64X64;  // accepted only when 64x64 transforms are compiled in
+#endif  // CONFIG_TX64X64
+    default: assert(0); return TX_4X4;  // unsupported bs: trap in debug; with NDEBUG still return a value (TX_4X4 was the fallback of the ternary chain this replaces)
+  }
+}
+
 static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
                                          const uint8_t *above,
                                          const uint8_t *left, int mode) {
   int k, r, c;
-  int buffer[33][65];
   int mean, ipred;
-  const TX_SIZE tx_size =
-      (bs == 32) ? TX_32X32
-                 : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+#if CONFIG_TX64X64
+  int buffer[65][129];
+#else
+  int buffer[33][65];
+#endif  // CONFIG_TX64X64
+  const TX_SIZE tx_size = get_txsize_from_blocklen(bs);
   const int c0 = av1_filter_intra_taps_4[tx_size][mode][0];
   const int c1 = av1_filter_intra_taps_4[tx_size][mode][1];
   const int c2 = av1_filter_intra_taps_4[tx_size][mode][2];
@@ -1040,11 +1077,13 @@
                                                 const uint16_t *left, int mode,
                                                 int bd) {
   int k, r, c;
-  int preds[33][65];
   int mean, ipred;
-  const TX_SIZE tx_size =
-      (bs == 32) ? TX_32X32
-                 : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+#if CONFIG_TX64X64
+  int preds[65][129];
+#else
+  int preds[33][65];
+#endif  // CONFIG_TX64X64
+  const TX_SIZE tx_size = get_txsize_from_blocklen(bs);
   const int c0 = av1_filter_intra_taps_4[tx_size][mode][0];
   const int c1 = av1_filter_intra_taps_4[tx_size][mode][1];
   const int c2 = av1_filter_intra_taps_4[tx_size][mode][2];