Reduce dqcoeff array size in decoder

The decoding process handles detokenization and reconstruction per
transform block sequentially. There is no need to offset the dqcoeff
buffer according to the transform block index. This allows to
reduce the memory spill and improve cache performance.

Change-Id: Ibb8bfe532a7a08fcabaf6d42cbec1e986901d32d
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index d293cf2..fd8fe3f 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -188,8 +188,11 @@
 #endif
 
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+#if CONFIG_VP9_ENCODER
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
-
+#else
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+#endif
   int lossless;
   int corrupted;
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index e81fa04..6ddfcbb 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -188,7 +188,7 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
     TX_TYPE tx_type = DCT_DCT;
-    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    tran_low_t *const dqcoeff = pd->dqcoeff;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       if (xd->lossless) {
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 3304e64..2e26059 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -217,7 +217,7 @@
                                                pd->left_context + y);
   const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
   const int eob = decode_coefs(xd, pd->plane_type,
-                               BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
+                               pd->dqcoeff, tx_size,
                                dequant, ctx, so->scan, so->neighbors, r);
   vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
   return eob;