Merge "Fix the bug that PVQ commit broke dering" into nextgenv2
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 24c3631..439a6b4 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -64,7 +64,7 @@
   int max_theta[PVQ_MAX_PARTITIONS];
   int qg[PVQ_MAX_PARTITIONS];
   int k[PVQ_MAX_PARTITIONS];
-  od_coeff y[OD_BSIZE_MAX * OD_BSIZE_MAX];
+  od_coeff y[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
   int nb_bands;
   int off[PVQ_MAX_PARTITIONS];
   int size[PVQ_MAX_PARTITIONS];
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index 6700383..96131f0 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -37,12 +37,17 @@
 
 /*Smallest blocks are 4x4*/
 #define OD_LOG_BSIZE0 (2)
-/*There are 4 block sizes total (4x4, 8x8, 16x16 and 32x32).*/
-#define OD_NBSIZES (4)
-/*The log of the maximum length of the side of a block.*/
-#define OD_LOG_BSIZE_MAX (OD_LOG_BSIZE0 + OD_NBSIZES - 1)
+/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
+#define OD_NBSIZES (5)
 /*The maximum length of the side of a block.*/
-#define OD_BSIZE_MAX (1 << OD_LOG_BSIZE_MAX)
+#define OD_BSIZE_MAX MAX_SB_SIZE
+
+/*There are 4 transform sizes total in AV1 (4x4, 8x8, 16x16 and 32x32).*/
+#define OD_TXSIZES TX_SIZES
+/*The log of the maximum length of the side of a transform.*/
+#define OD_LOG_TXSIZE_MAX (OD_LOG_BSIZE0 + OD_TXSIZES - 1)
+/*The maximum length of the side of a transform.*/
+#define OD_TXSIZE_MAX (1 << OD_LOG_TXSIZE_MAX)
 
 /**The maximum number of color planes allowed in a single frame.*/
 # define OD_NPLANES_MAX (3)
diff --git a/av1/common/partition.c b/av1/common/partition.c
index 63d9d69..6b9b6fa 100644
--- a/av1/common/partition.c
+++ b/av1/common/partition.c
@@ -93,7 +93,7 @@
 static const int OD_BAND_OFFSETS64[] = {13, 1, 16, 24, 32, 64, 96, 128, 256,
  384, 512, 1024, 1536, 2048, 4096};
 
-const int *const OD_BAND_OFFSETS[OD_NBSIZES + 1] = {
+const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1] = {
   OD_BAND_OFFSETS4,
   OD_BAND_OFFSETS8,
   OD_BAND_OFFSETS16,
@@ -158,7 +158,7 @@
   int bs;
   /* dst + 1 because DC is not included for 4x4 blocks. */
   od_band_from_raster(OD_LAYOUTS[0], dst + 1, src, stride, ty_type);
-  for (bs = 1; bs < OD_NBSIZES; bs++) {
+  for (bs = 1; bs < OD_TXSIZES; bs++) {
     int size;
     int offset;
     /* Length of block size > 4. */
@@ -190,7 +190,7 @@
   int bs;
   /* src + 1 because DC is not included for 4x4 blocks. */
   od_raster_from_band(OD_LAYOUTS[0], dst, stride, ty_type, src + 1);
-  for (bs = 1; bs < OD_NBSIZES; bs++) {
+  for (bs = 1; bs < OD_TXSIZES; bs++) {
     int size;
     int offset;
     /* Length of block size > 4 */
@@ -240,7 +240,7 @@
   int bs;
   /* dst + 1 because DC is not included for 4x4 blocks. */
   od_band_from_raster_16(OD_LAYOUTS[0], dst + 1, src, stride);
-  for (bs = 1; bs < OD_NBSIZES; bs++) {
+  for (bs = 1; bs < OD_TXSIZES; bs++) {
     int size;
     int offset;
     /* Length of block size > 4. */
diff --git a/av1/common/partition.h b/av1/common/partition.h
index c86cb81..5ee7f15 100644
--- a/av1/common/partition.h
+++ b/av1/common/partition.h
@@ -26,7 +26,7 @@
   const int *const band_offsets;
 } band_layout;
 
-extern const int *const OD_BAND_OFFSETS[OD_NBSIZES + 1];
+extern const int *const OD_BAND_OFFSETS[OD_TXSIZES + 1];
 
 void od_raster_to_coding_order(int16_t *dst, int n,  TX_TYPE ty_type,
  const int16_t *src, int stride);
diff --git a/av1/common/pvq.c b/av1/common/pvq.c
index 62f3632..81d0839 100644
--- a/av1/common/pvq.c
+++ b/av1/common/pvq.c
@@ -132,7 +132,7 @@
  OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
  OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
 
-const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_NBSIZES + 1] = {
+const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1] = {
  {{OD_PVQ_BETA4_LUMA, OD_PVQ_BETA8_LUMA,
    OD_PVQ_BETA16_LUMA, OD_PVQ_BETA32_LUMA},
   {OD_PVQ_BETA4_CHROMA, OD_PVQ_BETA8_CHROMA,
@@ -156,7 +156,7 @@
   generic_model_init(&state->pvq_param_model[0]);
   generic_model_init(&state->pvq_param_model[1]);
   generic_model_init(&state->pvq_param_model[2]);
-  for (i = 0; i < 2*OD_NBSIZES; i++) {
+  for (i = 0; i < 2*OD_TXSIZES; i++) {
     ctx->pvq_adapt[4*i + OD_ADAPT_K_Q8] = 384;
     ctx->pvq_adapt[4*i + OD_ADAPT_SUM_EX_Q8] = 256;
     ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_Q8] = 104;
@@ -165,12 +165,12 @@
   ctx->pvq_k1_increment = 128;
   OD_CDFS_INIT(ctx->pvq_k1_cdf, ctx->pvq_k1_increment);
   for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
-    for (bs = 0; bs < OD_NBSIZES; bs++)
+    for (bs = 0; bs < OD_TXSIZES; bs++)
     for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
       state->pvq_exg[pli][bs][i] = 2 << 16;
     }
   }
-  for (i = 0; i < OD_NBSIZES*PVQ_MAX_PARTITIONS; i++) {
+  for (i = 0; i < OD_TXSIZES*PVQ_MAX_PARTITIONS; i++) {
     state->pvq_ext[i] = is_keyframe ? 24576 : 2 << 16;
   }
   state->pvq_gaintheta_increment = 128;
@@ -195,14 +195,14 @@
 void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm) {
   int i;
   int j;
-  int16_t y[OD_BSIZE_MAX*OD_BSIZE_MAX];
-  int16_t y_inv[OD_BSIZE_MAX*OD_BSIZE_MAX];
+  int16_t y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
+  int16_t y_inv[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
   int16_t *x1;
   int16_t *x1_inv;
   int off;
   int bs;
   int xydec;
-  for (bs = 0; bs < OD_NBSIZES; bs++) {
+  for (bs = 0; bs < OD_TXSIZES; bs++) {
     for (xydec = 0; xydec < 2; xydec++) {
       off = od_qm_offset(bs, xydec);
       x1 = x + off;
@@ -259,7 +259,7 @@
 int od_qm_get_index(int bs, int band) {
   /* The -band/3 term is due to the fact that we force corresponding horizontal
      and vertical bands to have the same quantization. */
-  OD_ASSERT(bs >= 0 && bs < OD_NBSIZES);
+  OD_ASSERT(bs >= 0 && bs < OD_TXSIZES);
   return bs*(bs + 1) + band - band/3;
 }
 
diff --git a/av1/common/pvq.h b/av1/common/pvq.h
index 371e4bd..5a49a84 100644
--- a/av1/common/pvq.h
+++ b/av1/common/pvq.h
@@ -22,7 +22,7 @@
 extern const uint16_t EXP_CDF_TABLE[][16];
 extern const uint16_t LAPLACE_OFFSET[];
 
-# define PVQ_MAX_PARTITIONS (1 + 3*(OD_NBSIZES-1))
+# define PVQ_MAX_PARTITIONS (1 + 3*(OD_TXSIZES-1))
 
 # define OD_NOREF_ADAPT_SPEED (4)
 /* Normalized lambda for PVQ quantizer. Since we normalize the gain by q, the
@@ -57,7 +57,7 @@
 #define OD_QM_INV_SCALE_1 (1./OD_QM_INV_SCALE)
 #endif
 #define OD_QM_OFFSET(bs) ((((1 << 2*bs) - 1) << 2*OD_LOG_BSIZE0)/3)
-#define OD_QM_STRIDE (OD_QM_OFFSET(OD_NBSIZES))
+#define OD_QM_STRIDE (OD_QM_OFFSET(OD_TXSIZES))
 #define OD_QM_BUFFER_SIZE (2*OD_QM_STRIDE)
 
 #if !defined(OD_FLOAT_PVQ)
@@ -86,13 +86,13 @@
 #define OD_CGAIN_SCALE_2 (OD_CGAIN_SCALE_1*OD_CGAIN_SCALE_1)
 
-/* Largest PVQ partition is half the coefficients of largest block size. */
+/* Largest PVQ partition is half the coefficients of largest transform size. */
-#define MAXN (OD_BSIZE_MAX*OD_BSIZE_MAX/2)
+#define MAXN (OD_TXSIZE_MAX*OD_TXSIZE_MAX/2)
 
 #define OD_COMPAND_SHIFT (8 + OD_COEFF_SHIFT)
 #define OD_COMPAND_SCALE (1 << OD_COMPAND_SHIFT)
 #define OD_COMPAND_SCALE_1 (1./OD_COMPAND_SCALE)
 
-#define OD_QM_SIZE (OD_NBSIZES*(OD_NBSIZES + 1))
+#define OD_QM_SIZE (OD_TXSIZES*(OD_TXSIZES + 1))
 
 #define OD_FLAT_QM 0
 #define OD_HVS_QM  1
@@ -110,7 +110,7 @@
 typedef struct od_pvq_codeword_ctx od_pvq_codeword_ctx;
 
 struct od_pvq_codeword_ctx {
-  int                 pvq_adapt[2*OD_NBSIZES*OD_NSB_ADAPT_CTXS];
+  int                 pvq_adapt[2*OD_TXSIZES*OD_NSB_ADAPT_CTXS];
   int                 pvq_k1_increment;
   /* CDFs are size 16 despite the fact that we're using less than that. */
   uint16_t            pvq_k1_cdf[12][16];
@@ -121,12 +121,12 @@
 struct od_pvq_adapt_ctx {
   od_pvq_codeword_ctx pvq_codeword_ctx;
   generic_encoder     pvq_param_model[3];
-  int                 pvq_ext[OD_NBSIZES*PVQ_MAX_PARTITIONS];
-  int                 pvq_exg[OD_NPLANES_MAX][OD_NBSIZES][PVQ_MAX_PARTITIONS];
+  int                 pvq_ext[OD_TXSIZES*PVQ_MAX_PARTITIONS];
+  int                 pvq_exg[OD_NPLANES_MAX][OD_TXSIZES][PVQ_MAX_PARTITIONS];
   int                 pvq_gaintheta_increment;
-  uint16_t        pvq_gaintheta_cdf[2*OD_NBSIZES*PVQ_MAX_PARTITIONS][16];
+  uint16_t        pvq_gaintheta_cdf[2*OD_TXSIZES*PVQ_MAX_PARTITIONS][16];
   int                 pvq_skip_dir_increment;
-  uint16_t        pvq_skip_dir_cdf[2*(OD_NBSIZES-1)][7];
+  uint16_t        pvq_skip_dir_cdf[2*(OD_TXSIZES-1)][7];
 };
 
 void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe);
@@ -141,7 +141,7 @@
 
 int od_qm_get_index(int bs, int band);
 
-extern const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_NBSIZES + 1];
+extern const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_TXSIZES + 1];
 
 void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm);
 int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
diff --git a/av1/common/pvq_state.c b/av1/common/pvq_state.c
index 45d5184..2329d66 100644
--- a/av1/common/pvq_state.c
+++ b/av1/common/pvq_state.c
@@ -20,7 +20,7 @@
   OD_CDFS_INIT(adapt->skip_cdf, adapt->skip_increment >> 2);
   for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
     generic_model_init(&adapt->model_dc[pli]);
-    for (i = 0; i < OD_NBSIZES; i++) {
+    for (i = 0; i < OD_TXSIZES; i++) {
       adapt->ex_g[pli][i] = 8;
     }
     for (i = 0; i < 4; i++) {
diff --git a/av1/common/pvq_state.h b/av1/common/pvq_state.h
index 6cf56fe..0519451 100644
--- a/av1/common/pvq_state.h
+++ b/av1/common/pvq_state.h
@@ -30,11 +30,11 @@
 
   generic_encoder model_dc[OD_NPLANES_MAX];
 
-  int ex_dc[OD_NPLANES_MAX][OD_NBSIZES][3];
-  int ex_g[OD_NPLANES_MAX][OD_NBSIZES];
+  int ex_dc[OD_NPLANES_MAX][OD_TXSIZES][3];
+  int ex_g[OD_NPLANES_MAX][OD_TXSIZES];
 
   /* Joint skip flag for DC and AC */
-  uint16_t skip_cdf[OD_NBSIZES*2][4];
+  uint16_t skip_cdf[OD_TXSIZES*2][4];
   int skip_increment;
 };
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 1f707a7..fb240d7 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -329,11 +329,11 @@
   // int use_activity_masking = dec->use_activity_masking;
   int use_activity_masking = 0;
 
-  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
-  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
 
-  od_coeff ref_int32[OD_BSIZE_MAX * OD_BSIZE_MAX];
-  od_coeff out_int32[OD_BSIZE_MAX * OD_BSIZE_MAX];
+  od_coeff ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
+  od_coeff out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX];
 
   od_raster_to_coding_order(ref_coeff_pvq, blk_size, tx_type, ref_coeff,
                             blk_size);
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 0ee922f..f50da1c 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -45,7 +45,7 @@
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
 #if CONFIG_PVQ
   /* forward transformed predicted image, a reference for PVQ */
-  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
 #endif
 #if CONFIG_PALETTE
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
@@ -61,7 +61,7 @@
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
 #if CONFIG_PVQ
   /* forward transformed predicted image, a reference for PVQ */
-  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
 #endif
 #if CONFIG_PALETTE
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
diff --git a/av1/decoder/pvq_decoder.c b/av1/decoder/pvq_decoder.c
index 2340605..1cc75f8 100644
--- a/av1/decoder/pvq_decoder.c
+++ b/av1/decoder/pvq_decoder.c
@@ -348,7 +348,7 @@
       pvq_decode_partition(dec->ec, q, size[i],
        model, &dec->state.adapt, exg + i, ext + i, ref + off[i], out + off[i],
        &noref[i], beta[i], robust, is_keyframe, pli,
-       (pli != 0)*OD_NBSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
+       (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
        &cfl, i == 0 && (i < nb_bands - 1), skip_rest, i, &skip[i],
        qm + off[i], qm_inv + off[i]);
       if (i == 0 && !skip_rest[0] && bs > 0) {
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index ee62961..4b47dd4 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -2004,7 +2004,7 @@
                     &w->ec, pvq->qg[i], pvq->theta[i], pvq->max_theta[i],
                     pvq->y + pvq->off[i], pvq->size[i], pvq->k[i], model, adapt,
                     exg + i, ext + i, robust || is_keyframe,
-                    (plane != 0) * OD_NBSIZES * PVQ_MAX_PARTITIONS +
+                    (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS +
                         pvq->bs * PVQ_MAX_PARTITIONS + i,
                     is_keyframe, i == 0 && (i < pvq->nb_bands - 1),
                     pvq->skip_rest, encode_flip, flip);
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 91040db..dfde235 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1391,13 +1391,13 @@
 #if PVQ_CHROMA_RD
   double save_pvq_lambda;
 #endif
-  DECLARE_ALIGNED(16, int16_t, coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
-  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
-  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
 
-  DECLARE_ALIGNED(16, int32_t, in_int32[OD_BSIZE_MAX * OD_BSIZE_MAX]);
-  DECLARE_ALIGNED(16, int32_t, ref_int32[OD_BSIZE_MAX * OD_BSIZE_MAX]);
-  DECLARE_ALIGNED(16, int32_t, out_int32[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+  DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
+  DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
 
   *eob = 0;
 
diff --git a/av1/encoder/pvq_encoder.c b/av1/encoder/pvq_encoder.c
index 6143fe8..2d8340d 100644
--- a/av1/encoder/pvq_encoder.c
+++ b/av1/encoder/pvq_encoder.c
@@ -782,7 +782,7 @@
   int max_theta[PVQ_MAX_PARTITIONS];
   int qg[PVQ_MAX_PARTITIONS];
   int k[PVQ_MAX_PARTITIONS];
-  od_coeff y[OD_BSIZE_MAX*OD_BSIZE_MAX];
+  od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
   int *exg;
   int *ext;
   int nb_bands;
@@ -895,9 +895,9 @@
   if (pvq_info)
     pvq_info->ac_dc_coded = 2 + (out[0] != 0);
 #if OD_SIGNAL_Q_SCALING
-  if (bs == OD_NBSIZES - 1 && pli == 0) {
-    od_encode_quantizer_scaling(enc, q_scaling, bx >> (OD_NBSIZES - 1),
-     by >> (OD_NBSIZES - 1), 0);
+  if (bs == OD_TXSIZES - 1 && pli == 0) {
+    od_encode_quantizer_scaling(enc, q_scaling, bx >> (OD_TXSIZES - 1),
+     by >> (OD_TXSIZES - 1), 0);
   }
 #endif
   cfl_encoded = 0;
@@ -934,7 +934,7 @@
     if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
       pvq_encode_partition(&enc->ec, qg[i], theta[i], max_theta[i], y + off[i],
        size[i], k[i], model, &enc->state.adapt, exg + i, ext + i,
-       robust || is_keyframe, (pli != 0)*OD_NBSIZES*PVQ_MAX_PARTITIONS
+       robust || is_keyframe, (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS
        + bs*PVQ_MAX_PARTITIONS + i, is_keyframe, i == 0 && (i < nb_bands - 1),
        skip_rest, encode_flip, flip);
     }
@@ -998,14 +998,14 @@
     if (pvq_info)
       pvq_info->ac_dc_coded = (out[0] != 0);
 #if OD_SIGNAL_Q_SCALING
-    if (bs == OD_NBSIZES - 1 && pli == 0) {
+    if (bs == OD_TXSIZES - 1 && pli == 0) {
       int skip;
       skip = out[0] == 0;
       if (skip) {
         q_scaling = 0;
       }
-      od_encode_quantizer_scaling(enc, q_scaling, bx >> (OD_NBSIZES - 1),
-       by >> (OD_NBSIZES - 1), skip);
+      od_encode_quantizer_scaling(enc, q_scaling, bx >> (OD_TXSIZES - 1),
+       by >> (OD_TXSIZES - 1), skip);
     }
 #endif
     if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;