Align a few buffers to 32-byte boundaries

This better supports AVX2 operations, whose aligned 256-bit loads and
stores require 32-byte-aligned addresses.

BUG=aomedia:1533

Change-Id: Ife789ac9193b13b3af7f979b33523b1e1db16f51
diff --git a/av1/common/idct.c b/av1/common/idct.c
index be1ef7e..6bab6ef 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1683,7 +1683,7 @@
 void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                         const TxfmParam *txfm_param) {
   const TX_SIZE tx_size = txfm_param->tx_size;
-  DECLARE_ALIGNED(16, uint16_t, tmp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
   int tmp_stride = MAX_TX_SIZE;
   int w = tx_size_wide[tx_size];
   int h = tx_size_high[tx_size];
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 5de74ce..e17e58d 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -36,9 +36,9 @@
 typedef struct TileData {
   AV1_COMMON *cm;
   aom_reader bit_reader;
-  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  DECLARE_ALIGNED(32, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_PALETTE_SQUARE]);
 } TileData;
@@ -52,9 +52,9 @@
 } TileBufferDec;
 
 typedef struct AV1Decoder {
-  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+  DECLARE_ALIGNED(32, MACROBLOCKD, mb);
 
-  DECLARE_ALIGNED(16, AV1_COMMON, common);
+  DECLARE_ALIGNED(32, AV1_COMMON, common);
 
   int ready_for_new_data;