New experiment: Perceptual Vector Quantization from Daala

PVQ replaces the scalar quantizer and coefficient coding with a new
design originally developed in Daala. It currently depends on the
Daala entropy coder although it could be adapted to work with another
entropy coder if needed:
./configure --enable-experimental --enable-daala_ec --enable-pvq

The version of PVQ in this commit is adapted from the following
revision of Daala:
https://github.com/xiph/daala/commit/fb51c1ade6a31b668a0157d89de8f0a4493162a8

More information about PVQ:
- https://people.xiph.org/~jm/daala/pvq_demo/
- https://jmvalin.ca/papers/spie_pvq.pdf

The following files are copied as-is from Daala with minimal
adaptations, therefore we disable clang-format on those files
to make it easier to synchronize the AV1 and Daala codebases in the future:
 av1/common/generic_code.c
 av1/common/generic_code.h
 av1/common/laplace_tables.c
 av1/common/partition.c
 av1/common/partition.h
 av1/common/pvq.c
 av1/common/pvq.h
 av1/common/state.c
 av1/common/state.h
 av1/common/zigzag.h
 av1/common/zigzag16.c
 av1/common/zigzag32.c
 av1/common/zigzag4.c
 av1/common/zigzag64.c
 av1/common/zigzag8.c
 av1/decoder/decint.h
 av1/decoder/generic_decoder.c
 av1/decoder/laplace_decoder.c
 av1/decoder/pvq_decoder.c
 av1/decoder/pvq_decoder.h
 av1/encoder/daala_compat_enc.c
 av1/encoder/encint.h
 av1/encoder/generic_encoder.c
 av1/encoder/laplace_encoder.c
 av1/encoder/pvq_encoder.c
 av1/encoder/pvq_encoder.h

Known issues:
- Lossless mode is not supported, '--lossless=1' will give the same result as
'--end-usage=q --cq-level=1'.
- High bit depth is not supported by PVQ.

Change-Id: I1ae0d6517b87f4c1ccea944b2e12dc906979f25e
diff --git a/av1/decoder/decint.h b/av1/decoder/decint.h
new file mode 100644
index 0000000..99dbc43
--- /dev/null
+++ b/av1/decoder/decint.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_decint_H)
+# define _decint_H (1)
+# include "av1/common/pvq_state.h"
+# include "aom_dsp/entdec.h"
+
+typedef struct daala_dec_ctx daala_dec_ctx;
+
+typedef struct daala_dec_ctx od_dec_ctx;
+
+
+struct daala_dec_ctx {
+  /* Stores context-adaptive CDFs for PVQ. */
+  od_state state;
+  /* Daala entropy decoder. */
+  od_ec_dec *ec;
+  /* Mode of quantization matrice : FLAT (0) or HVS (1) */
+  int qm;
+};
+
+#endif
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 026dcbc..ee067f7 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -57,6 +57,16 @@
 #define MAX_AV1_HEADER_SIZE 80
 #define ACCT_STR __func__
 
+#if CONFIG_PVQ
+#include "av1/decoder/pvq_decoder.h"
+#include "av1/encoder/encodemb.h"
+
+#include "aom_dsp/entdec.h"
+#include "av1/common/partition.h"
+#include "av1/decoder/decint.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#endif
+
 static struct aom_read_bit_buffer *init_read_bit_buffer(
     AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
     const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
@@ -299,6 +309,141 @@
   memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
 }
 
+#if CONFIG_PVQ
+static int av1_pvq_decode_helper(od_dec_ctx *dec, int16_t *ref_coeff,
+                                 int16_t *dqcoeff, int16_t *quant, int pli,
+                                 int bs, TX_TYPE tx_type, int xdec,
+                                 int ac_dc_coded) {
+  unsigned int flags;  // used for daala's stream analyzer.
+  int off;
+  const int is_keyframe = 0;
+  const int has_dc_skip = 1;
+  int quant_shift = bs == TX_32X32 ? 1 : 0;
+  // DC quantizer for PVQ
+  int pvq_dc_quant;
+  int lossless = (quant[0] == 0);
+  const int blk_size = tx_size_wide[bs];
+  int eob = 0;
+  int i;
+  // TODO(yushin) : To enable activity masking,
+  // int use_activity_masking = dec->use_activity_masking;
+  int use_activity_masking = 0;
+
+  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+
+  od_coeff ref_int32[OD_BSIZE_MAX * OD_BSIZE_MAX];
+  od_coeff out_int32[OD_BSIZE_MAX * OD_BSIZE_MAX];
+
+  od_raster_to_coding_order(ref_coeff_pvq, blk_size, tx_type, ref_coeff,
+                            blk_size);
+
+  if (lossless)
+    pvq_dc_quant = 1;
+  else {
+    // TODO(yushin): Enable this for activity masking,
+    // when pvq_qm_q4 is available in AOM.
+    // pvq_dc_quant = OD_MAXI(1, quant*
+    // dec->state.pvq_qm_q4[pli][od_qm_get_index(bs, 0)] >> 4);
+    pvq_dc_quant = OD_MAXI(1, quant[0] >> quant_shift);
+  }
+
+  off = od_qm_offset(bs, xdec);
+
+  // copy int16 inputs to int32
+  for (i = 0; i < blk_size * blk_size; i++) ref_int32[i] = ref_coeff_pvq[i];
+
+  od_pvq_decode(dec, ref_int32, out_int32, (int)quant[1] >> quant_shift, pli,
+                bs, OD_PVQ_BETA[use_activity_masking][pli][bs],
+                OD_ROBUST_STREAM,
+                is_keyframe, &flags, ac_dc_coded, dec->state.qm + off,
+                dec->state.qm_inv + off);
+
+  // copy int32 result back to int16
+  for (i = 0; i < blk_size * blk_size; i++) dqcoeff_pvq[i] = out_int32[i];
+
+  if (!has_dc_skip || dqcoeff_pvq[0]) {
+    dqcoeff_pvq[0] =
+        has_dc_skip + generic_decode(dec->ec, &dec->state.adapt.model_dc[pli],
+                                     -1, &dec->state.adapt.ex_dc[pli][bs][0], 2,
+                                     "dc:mag");
+    if (dqcoeff_pvq[0])
+      dqcoeff_pvq[0] *= od_ec_dec_bits(dec->ec, 1, "dc:sign") ? -1 : 1;
+  }
+  dqcoeff_pvq[0] = dqcoeff_pvq[0] * pvq_dc_quant + ref_coeff_pvq[0];
+
+  od_coding_order_to_raster(dqcoeff, blk_size, tx_type, dqcoeff_pvq, blk_size);
+
+  eob = blk_size * blk_size;
+
+  return eob;
+}
+
+static int av1_pvq_decode_helper2(
+    MACROBLOCKD *const xd, MB_MODE_INFO *const mbmi, int plane, int row,
+    int col, TX_SIZE tx_size, TX_TYPE tx_type ) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  // transform block size in pixels
+  int tx_blk_size = tx_size_wide[tx_size];
+  int i, j;
+  tran_low_t *pvq_ref_coeff = pd->pvq_ref_coeff;
+  const int diff_stride = tx_blk_size;
+  int16_t *pred = pd->pred;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  int ac_dc_coded;  // bit0: DC coded, bit1 : AC coded
+  uint8_t *dst;
+  int eob;
+
+  eob = 0;
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+  // decode ac/dc coded flag. bit0: DC coded, bit1 : AC coded
+  // NOTE : we don't use 5 symbols for luma here in aom codebase,
+  // since block partition is taken care of by aom.
+  // So, only AC/DC skip info is coded
+  ac_dc_coded = od_decode_cdf_adapt(
+      xd->daala_dec.ec,
+      xd->daala_dec.state.adapt.skip_cdf[2 * tx_size + (plane != 0)], 4,
+      xd->daala_dec.state.adapt.skip_increment, "skip");
+
+  if (ac_dc_coded) {
+    int xdec = pd->subsampling_x;
+    int seg_id = mbmi->segment_id;
+    int16_t *quant;
+    FWD_TXFM_PARAM fwd_txfm_param;
+
+    for (j = 0; j < tx_blk_size; j++)
+      for (i = 0; i < tx_blk_size; i++) {
+        pred[diff_stride * j + i] = dst[pd->dst.stride * j + i];
+      }
+
+    fwd_txfm_param.tx_type = tx_type;
+    fwd_txfm_param.tx_size = tx_size;
+    fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+    fwd_txfm_param.rd_transform = 0;
+    fwd_txfm_param.lossless = xd->lossless[seg_id];
+
+    fwd_txfm(pred, pvq_ref_coeff, diff_stride, &fwd_txfm_param);
+
+    quant = &pd->seg_dequant[seg_id][0];  // aom's quantizer
+
+    eob = av1_pvq_decode_helper(&xd->daala_dec, pvq_ref_coeff, dqcoeff, quant,
+                                plane, tx_size, tx_type, xdec, ac_dc_coded);
+
+    // Since av1 does not have separate inverse transform
+    // but also contains adding to predicted image,
+    // pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst as zeros
+    for (j = 0; j < tx_blk_size; j++)
+      for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
+
+    inverse_transform_block(xd, plane, tx_type, tx_size, dst,
+                            pd->dst.stride, eob);
+  }
+
+  return eob;
+}
+#endif
+
 static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
                                                 MACROBLOCKD *const xd,
 #if CONFIG_ANS
@@ -314,6 +459,10 @@
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   uint8_t *dst;
   int block_idx = (row << 1) + col;
+#if CONFIG_PVQ
+  (void)cm;
+  (void)r;
+#endif
   dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
 
   if (mbmi->sb_type < BLOCK_8X8)
@@ -324,6 +473,7 @@
 
   if (!mbmi->skip) {
     TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+#if !CONFIG_PVQ
     const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
     int16_t max_scan_line = 0;
     const int eob =
@@ -335,6 +485,9 @@
     if (eob)
       inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
                               max_scan_line, eob);
+#else
+    av1_pvq_decode_helper2(xd, mbmi, plane, row, col, tx_size, tx_type);
+#endif
   }
 }
 
@@ -404,6 +557,13 @@
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   int block_idx = (row << 1) + col;
   TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+#if CONFIG_PVQ
+  int eob;
+  (void)cm;
+  (void)r;
+#endif
+
+#if !CONFIG_PVQ
   const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
   int16_t max_scan_line = 0;
   const int eob =
@@ -416,6 +576,9 @@
     inverse_transform_block(xd, plane, tx_type, tx_size,
                             &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
                             pd->dst.stride, max_scan_line, eob);
+#else
+  eob = av1_pvq_decode_helper2(xd, mbmi, plane, row, col, tx_size, tx_type);
+#endif
   return eob;
 }
 #endif  // !CONFIG_VAR_TX || CONFIG_SUPER_TX
@@ -1507,6 +1670,11 @@
 #endif
                              n8x8_l2);
   subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+
+#if CONFIG_PVQ
+  assert(partition < PARTITION_TYPES);
+  assert(subsize < BLOCK_SIZES);
+#endif
 #if CONFIG_SUPERTX
   if (!frame_is_intra_only(cm) && partition != PARTITION_NONE &&
       bsize <= MAX_SUPERTX_BLOCK_SIZE && !supertx_enabled && !xd->lossless[0]) {
@@ -1897,6 +2065,7 @@
 }
 #endif
 
+#if !CONFIG_PVQ
 static void read_coef_probs_common(av1_coeff_probs_model *coef_probs,
                                    aom_reader *r) {
   int i, j, k, l, m;
@@ -1921,6 +2090,7 @@
   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
     read_coef_probs_common(fc->coef_probs[tx_size], r);
 }
+#endif
 
 static void setup_segmentation(AV1_COMMON *const cm,
                                struct aom_read_bit_buffer *rb) {
@@ -2767,6 +2937,18 @@
 }
 #endif  // CONFIG_EXT_TILE
 
+#if CONFIG_PVQ
+static void daala_dec_init(daala_dec_ctx *daala_dec, od_ec_dec *ec) {
+  daala_dec->ec = ec;
+  od_adapt_ctx_reset(&daala_dec->state.adapt, 0);
+
+  daala_dec->qm = OD_FLAT_QM;
+
+  od_init_qm(daala_dec->state.qm, daala_dec->state.qm_inv,
+             daala_dec->qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+}
+#endif
+
 static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
                                    const uint8_t *data_end) {
   AV1_COMMON *const cm = &pbi->common;
@@ -2849,6 +3031,9 @@
               ? &cm->counts
               : NULL;
       av1_zero(td->dqcoeff);
+#if CONFIG_PVQ
+      av1_zero(tile_data->pvq_ref_coeff);
+#endif
       av1_tile_init(&td->xd.tile, td->cm, tile_row, tile_col);
 #if !CONFIG_ANS
       setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
@@ -2864,7 +3049,14 @@
         td->bit_reader.accounting = NULL;
       }
 #endif
-      av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+      av1_init_macroblockd(cm, &td->xd,
+#if CONFIG_PVQ
+                           td->pvq_ref_coeff,
+#endif
+                           td->dqcoeff);
+#if CONFIG_PVQ
+      daala_dec_init(&td->xd.daala_dec, &td->bit_reader.ec);
+#endif
 #if CONFIG_PALETTE
       td->xd.plane[0].color_index_map = td->color_index_map[0];
       td->xd.plane[1].color_index_map = td->color_index_map[1];
@@ -3196,7 +3388,14 @@
                             &twd->bit_reader, pbi->decrypt_cb,
                             pbi->decrypt_state);
 #endif  // CONFIG_ANS
-        av1_init_macroblockd(cm, &twd->xd, twd->dqcoeff);
+        av1_init_macroblockd(cm, &twd->xd,
+#if CONFIG_PVQ
+                             twd->pvq_ref_coeff,
+#endif
+                             twd->dqcoeff);
+#if CONFIG_PVQ
+      daala_dec_init(&twd->xd.daala_dec, &twd->bit_reader.ec);
+#endif
 #if CONFIG_PALETTE
         twd->xd.plane[0].color_index_map = twd->color_index_map[0];
         twd->xd.plane[1].color_index_map = twd->color_index_map[1];
@@ -3734,6 +3933,7 @@
 
   if (cm->tx_mode == TX_MODE_SELECT) read_tx_size_probs(fc, &r);
 
+#if !CONFIG_PVQ
   read_coef_probs(fc, cm->tx_mode, &r);
 
 #if CONFIG_VAR_TX
@@ -3745,8 +3945,8 @@
       av1_diff_update_prob(&r, &fc->rect_tx_prob[i], ACCT_STR);
   }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-#endif
-
+#endif  // CONFIG_VAR_TX
+#endif  // !CONFIG_PVQ
   for (k = 0; k < SKIP_CONTEXTS; ++k)
     av1_diff_update_prob(&r, &fc->skip_probs[k], ACCT_STR);
 
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index c3099ba..7547656 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -33,7 +33,10 @@
 
 #include "av1/decoder/decodeframe.h"
 #include "av1/decoder/decoder.h"
+
+#if !CONFIG_PVQ
 #include "av1/decoder/detokenize.h"
+#endif
 
 static void initialize_dec(void) {
   static volatile int init_done = 0;
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 262995a..0ee922f 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -26,6 +26,12 @@
 #include "av1/common/accounting.h"
 #endif
 
+#if CONFIG_PVQ
+#include "aom_dsp/entdec.h"
+#include "av1/decoder/decint.h"
+#include "av1/encoder/encodemb.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -37,6 +43,10 @@
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+#if CONFIG_PVQ
+  /* forward transformed predicted image, a reference for PVQ */
+  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+#endif
 #if CONFIG_PALETTE
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
 #endif  // CONFIG_PALETTE
@@ -49,6 +59,10 @@
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+#if CONFIG_PVQ
+  /* forward transformed predicted image, a reference for PVQ */
+  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+#endif
 #if CONFIG_PALETTE
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
 #endif  // CONFIG_PALETTE
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 795b1b0..0f183f2 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -9,9 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#if !CONFIG_PVQ
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-
+#endif
+#if !CONFIG_PVQ
 #if CONFIG_ANS
 #include "aom_dsp/ans.h"
 #endif  // CONFIG_ANS
@@ -356,3 +358,4 @@
   av1_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
+#endif
diff --git a/av1/decoder/detokenize.h b/av1/decoder/detokenize.h
index 1eb1e6c..ec68665 100644
--- a/av1/decoder/detokenize.h
+++ b/av1/decoder/detokenize.h
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#if !CONFIG_PVQ
 #ifndef AV1_DECODER_DETOKENIZE_H_
 #define AV1_DECODER_DETOKENIZE_H_
 
@@ -39,5 +40,5 @@
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-
 #endif  // AV1_DECODER_DETOKENIZE_H_
+#endif
diff --git a/av1/decoder/generic_decoder.c b/av1/decoder/generic_decoder.c
new file mode 100644
index 0000000..86187fa
--- /dev/null
+++ b/av1/decoder/generic_decoder.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/entdec.h"
+#include "av1/common/generic_code.h"
+#include "av1/common/odintrin.h"
+#include "pvq_decoder.h"
+
+/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * @param [in,out] enc   range encoder
+ * @param [in,out] cdf   CDF of the variable (Q15)
+ * @param [in]     n     number of values possible
+ * @param [in,out] count number of symbols encoded with that cdf so far
+ * @param [in]     rate  adaptation rate shift (smaller is faster)
+ * @return decoded variable
+ */
+int od_decode_cdf_adapt_q15_(od_ec_dec *ec, uint16_t *cdf, int n,
+ int *count, int rate OD_ACC_STR) {
+  int val;
+  int i;
+  if (*count == 0) {
+    int ft;
+    ft = cdf[n - 1];
+    for (i = 0; i < n; i++) {
+      cdf[i] = cdf[i]*32768/ft;
+    }
+  }
+  val = od_ec_decode_cdf_q15(ec, cdf, n);
+  od_cdf_adapt_q15(val, cdf, n, count, rate);
+  return val;
+}
+
+/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * @param [in,out] enc   range encoder
+ * @param [in]     cdf   CDF of the variable (Q15)
+ * @param [in]     n     number of values possible
+ * @param [in]     increment adaptation speed (Q15)
+ *
+ * @retval decoded variable
+ */
+int od_decode_cdf_adapt_(od_ec_dec *ec, uint16_t *cdf, int n,
+ int increment OD_ACC_STR) {
+  int i;
+  int val;
+  val = od_ec_decode_cdf_unscaled(ec, cdf, n);
+  if (cdf[n-1] + increment > 32767) {
+    for (i = 0; i < n; i++) {
+      /* Second term ensures that the pdf is non-null */
+      cdf[i] = (cdf[i] >> 1) + i + 1;
+    }
+  }
+  for (i = val; i < n; i++) cdf[i] += increment;
+  return val;
+}
+
+/** Encodes a random variable using a "generic" model, assuming that the
+ * distribution is one-sided (zero and up), has a single mode, and decays
+ * exponentially past the model.
+ *
+ * @param [in,out] dec   range decoder
+ * @param [in,out] model generic probability model
+ * @param [in]     x     variable being encoded
+ * @param [in,out] ExQ16 expectation of x (adapted)
+ * @param [in]     integration integration period of ExQ16 (leaky average over
+ * 1<<integration samples)
+ *
+ * @retval decoded variable x
+ */
+int generic_decode_(od_ec_dec *dec, generic_encoder *model, int max,
+ int *ex_q16, int integration OD_ACC_STR) {
+  int lg_q1;
+  int shift;
+  int id;
+  uint16_t *cdf;
+  int xs;
+  int lsb;
+  int x;
+  int ms;
+  lsb = 0;
+  if (max == 0) return 0;
+  lg_q1 = log_ex(*ex_q16);
+  /* If expectation is too large, shift x to ensure that
+     all we have past xs=15 is the exponentially decaying tail
+     of the distribution. */
+  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+  /* Choose the cdf to use: we have two per "octave" of ExQ16. */
+  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+  cdf = model->cdf[id];
+  ms = (max + (1 << shift >> 1)) >> shift;
+  if (max == -1) xs = od_ec_decode_cdf_unscaled(dec, cdf, 16);
+  else xs = od_ec_decode_cdf_unscaled(dec, cdf, OD_MINI(ms + 1, 16));
+  if (xs == 15) {
+    int e;
+    unsigned decay;
+    /* Estimate decay based on the assumption that the distribution is close
+       to Laplacian for large values. We should probably have an adaptive
+       estimate instead. Note: The 2* is a kludge that's not fully understood
+       yet. */
+    OD_ASSERT(*ex_q16 < INT_MAX >> 1);
+    e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
+    decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
+    xs += laplace_decode_special(dec, decay, (max == -1) ? -1 : ms - 15, acc_str);
+  }
+  if (shift != 0) {
+    int special;
+    /* Because of the rounding, there's only half the number of possibilities
+       for xs=0 */
+    special = xs == 0;
+    if (shift - special > 0) lsb = od_ec_dec_bits(dec, shift - special, acc_str);
+    lsb -= !special << (shift - 1);
+  }
+  x = (xs << shift) + lsb;
+  generic_model_update(model, ex_q16, x, xs, id, integration);
+  OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG,
+   "dec: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, dec->rng));
+  return x;
+}
diff --git a/av1/decoder/laplace_decoder.c b/av1/decoder/laplace_decoder.c
new file mode 100644
index 0000000..4c3def5
--- /dev/null
+++ b/av1/decoder/laplace_decoder.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include "aom_dsp/entdec.h"
+#include "av1/common/pvq.h"
+#include "pvq_decoder.h"
+
+#if OD_ACCOUNTING
+# define od_decode_pvq_split(ec, adapt, sum, ctx, str) od_decode_pvq_split_(ec, adapt, sum, ctx, str)
+#else
+# define od_decode_pvq_split(ec, adapt, sum, ctx, str) od_decode_pvq_split_(ec, adapt, sum, ctx)
+#endif
+
+static int od_decode_pvq_split_(od_ec_dec *ec, od_pvq_codeword_ctx *adapt,
+ int sum, int ctx OD_ACC_STR) {
+  int shift;
+  int count;
+  int msbs;
+  int fctx;
+  count = 0;
+  if (sum == 0) return 0;
+  shift = OD_MAXI(0, OD_ILOG(sum) - 3);
+  fctx = 7*ctx + (sum >> shift) - 1;
+  msbs = od_decode_cdf_adapt(ec, adapt->pvq_split_cdf[fctx],
+   (sum >> shift) + 1, adapt->pvq_split_increment, acc_str);
+  if (shift) count = od_ec_dec_bits(ec, shift, acc_str);
+  count += msbs << shift;
+  if (count > sum) {
+    count = sum;
+    ec->error = 1;
+  }
+  return count;
+}
+
+void od_decode_band_pvq_splits(od_ec_dec *ec, od_pvq_codeword_ctx *adapt,
+ od_coeff *y, int n, int k, int level) {
+  int mid;
+  int count_right;
+  if (n == 1) {
+    y[0] = k;
+  }
+  else if (k == 0) {
+    OD_CLEAR(y, n);
+  }
+  else if (k == 1 && n <= 16) {
+    int cdf_id;
+    int pos;
+    cdf_id = od_pvq_k1_ctx(n, level == 0);
+    OD_CLEAR(y, n);
+    pos = od_decode_cdf_adapt(ec, adapt->pvq_k1_cdf[cdf_id], n,
+     adapt->pvq_k1_increment, "pvq:k1");
+    y[pos] = 1;
+  }
+  else {
+    mid = n >> 1;
+    count_right = od_decode_pvq_split(ec, adapt, k, od_pvq_size_ctx(n),
+     "pvq:split");
+    od_decode_band_pvq_splits(ec, adapt, y, mid, k - count_right, level + 1);
+    od_decode_band_pvq_splits(ec, adapt, y + mid, n - mid, count_right,
+     level + 1);
+  }
+}
+
+/** Decodes the tail of a Laplace-distributed variable, i.e. it doesn't
+ * do anything special for the zero case.
+ *
+ * @param [dec] range decoder
+ * @param [decay] decay factor of the distribution, i.e. pdf ~= decay^x
+ * @param [max] maximum possible value of x (used to truncate the pdf)
+ *
+ * @retval decoded variable x
+ */
+int od_laplace_decode_special_(od_ec_dec *dec, unsigned decay, int max OD_ACC_STR) {
+  int pos;
+  int shift;
+  int xs;
+  int ms;
+  int sym;
+  const uint16_t *cdf;
+  shift = 0;
+  if (max == 0) return 0;
+  /* We don't want a large decay value because that would require too many
+     symbols. However, it's OK if the max is below 15. */
+  while (((max >> shift) >= 15 || max == -1) && decay > 235) {
+    decay = (decay*decay + 128) >> 8;
+    shift++;
+  }
+  decay = OD_MINI(decay, 254);
+  decay = OD_MAXI(decay, 2);
+  ms = max >> shift;
+  cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
+  OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d\n", decay));
+  xs = 0;
+  do {
+    sym = OD_MINI(xs, 15);
+    {
+      int i;
+      OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d", xs, shift, sym, max));
+      for (i = 0; i < 16; i++) {
+        OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
+      }
+      OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n"));
+    }
+    if (ms > 0 && ms < 15) {
+      /* Simple way of truncating the pdf when we have a bound. */
+      sym = od_ec_decode_cdf_unscaled(dec, cdf, ms + 1);
+    }
+    else sym = od_ec_decode_cdf_q15(dec, cdf, 16);
+    xs += sym;
+    ms -= 15;
+  }
+  while (sym >= 15 && ms != 0);
+  if (shift) pos = (xs << shift) + od_ec_dec_bits(dec, shift, acc_str);
+  else pos = xs;
+  OD_ASSERT(pos >> shift <= max >> shift || max == -1);
+  if (max != -1 && pos > max) {
+    pos = max;
+    dec->error = 1;
+  }
+  OD_ASSERT(pos <= max || max == -1);
+  return pos;
+}
+
+/** Decodes a Laplace-distributed variable for use in PVQ.
+ *
+ * @param [in,out] dec  range decoder
+ * @param [in]     ExQ8 expectation of the absolute value of x
+ * @param [in]     K    maximum value of |x|
+ *
+ * @retval decoded variable (including sign)
+ */
+int od_laplace_decode_(od_ec_dec *dec, unsigned ex_q8, int k OD_ACC_STR) {
+  int j;
+  int shift;
+  uint16_t cdf[16];
+  int sym;
+  int lsb;
+  int decay;
+  int offset;
+  lsb = 0;
+  /* Shift down x if expectation is too high. */
+  shift = OD_ILOG(ex_q8) - 11;
+  if (shift < 0) shift = 0;
+  /* Apply the shift with rounding to Ex, K and xs. */
+  ex_q8 = (ex_q8 + (1 << shift >> 1)) >> shift;
+  k = (k + (1 << shift >> 1)) >> shift;
+  decay = OD_MINI(254, OD_DIVU(256*ex_q8, (ex_q8 + 256)));
+  offset = LAPLACE_OFFSET[(decay + 1) >> 1];
+  for (j = 0; j < 16; j++) {
+    cdf[j] = EXP_CDF_TABLE[(decay + 1) >> 1][j] - offset;
+  }
+  /* Simple way of truncating the pdf when we have a bound */
+  if (k == 0) sym = 0;
+  else sym = od_ec_decode_cdf_unscaled(dec, cdf, OD_MINI(k + 1, 16));
+  if (shift) {
+    int special;
+    /* Because of the rounding, there's only half the number of possibilities
+       for xs=0 */
+    special = (sym == 0);
+    if (shift - special > 0) lsb = od_ec_dec_bits(dec, shift - special, acc_str);
+    lsb -= (!special << (shift - 1));
+  }
+  /* Handle the exponentially-decaying tail of the distribution */
+  if (sym == 15) sym += laplace_decode_special(dec, decay, k - 15, acc_str);
+  return (sym << shift) + lsb;
+}
+
+#if OD_ACCOUNTING
+# define laplace_decode_vector_delta(dec, y, n, k, curr, means, str) laplace_decode_vector_delta_(dec, y, n, k, curr, means, str)
+#else
+# define laplace_decode_vector_delta(dec, y, n, k, curr, means, str) laplace_decode_vector_delta_(dec, y, n, k, curr, means)
+#endif
+
+static void laplace_decode_vector_delta_(od_ec_dec *dec, od_coeff *y, int n, int k,
+                                        int32_t *curr, const int32_t *means
+                                        OD_ACC_STR) {
+  int i;
+  int prev;
+  int sum_ex;
+  int sum_c;
+  int coef;
+  int pos;
+  int k0;
+  int sign;
+  int first;
+  int k_left;
+  prev = 0;
+  sum_ex = 0;
+  sum_c = 0;
+  coef = 256*means[OD_ADAPT_COUNT_Q8]/
+   (1 + means[OD_ADAPT_COUNT_EX_Q8]);
+  pos = 0;
+  sign = 0;
+  first = 1;
+  k_left = k;
+  for (i = 0; i < n; i++) y[i] = 0;
+  k0 = k_left;
+  coef = OD_MAXI(coef, 1);
+  for (i = 0; i < k0; i++) {
+    int count;
+    if (first) {
+      int decay;
+      int ex = coef*(n - prev)/k_left;
+      if (ex > 65280) decay = 255;
+      else {
+        decay = OD_MINI(255,
+         (int)((256*ex/(ex + 256) + (ex>>5)*ex/((n + 1)*(n - 1)*(n - 1)))));
+      }
+      /*Update mean position.*/
+      count = laplace_decode_special(dec, decay, n - 1, acc_str);
+      first = 0;
+    }
+    else count = laplace_decode(dec, coef*(n - prev)/k_left, n - prev - 1, acc_str);
+    sum_ex += 256*(n - prev);
+    sum_c += count*k_left;
+    pos += count;
+    OD_ASSERT(pos < n);
+    if (y[pos] == 0)
+      sign = od_ec_dec_bits(dec, 1, acc_str);
+    y[pos] += sign ? -1 : 1;
+    prev = pos;
+    k_left--;
+    if (k_left == 0) break;
+  }
+  if (k > 0) {
+    curr[OD_ADAPT_COUNT_Q8] = 256*sum_c;
+    curr[OD_ADAPT_COUNT_EX_Q8] = sum_ex;
+  }
+  else {
+    curr[OD_ADAPT_COUNT_Q8] = -1;
+    curr[OD_ADAPT_COUNT_EX_Q8] = 0;
+  }
+  curr[OD_ADAPT_K_Q8] = 0;
+  curr[OD_ADAPT_SUM_EX_Q8] = 0;
+}
+
+/** Decodes a vector of integers assumed to come from rounding a sequence of
+ * Laplace-distributed real values in decreasing order of variance.
+ *
+ * @param [in,out] dec range decoder
+ * @param [in]     y     decoded vector
+ * @param [in]     N     dimension of the vector
+ * @param [in]     K     sum of the absolute value of components of y
+ * @param [out]    curr  Adaptation context output, may alias means.
+ * @param [in]     means Adaptation context input.
+ */
+void od_laplace_decode_vector_(od_ec_dec *dec, od_coeff *y, int n, int k,
+                           int32_t *curr, const int32_t *means OD_ACC_STR) {
+  int i;
+  int sum_ex;
+  int kn;
+  int exp_q8;
+  int mean_k_q8;
+  int mean_sum_ex_q8;
+  int ran_delta;
+  ran_delta = 0;
+  if (k <= 1) {
+    laplace_decode_vector_delta(dec, y, n, k, curr, means, acc_str);
+    return;
+  }
+  if (k == 0) {
+    curr[OD_ADAPT_COUNT_Q8] = OD_ADAPT_NO_VALUE;
+    curr[OD_ADAPT_COUNT_EX_Q8] = OD_ADAPT_NO_VALUE;
+    curr[OD_ADAPT_K_Q8] = 0;
+    curr[OD_ADAPT_SUM_EX_Q8] = 0;
+    for (i = 0; i < n; i++) y[i] = 0;
+    return;
+  }
+  sum_ex = 0;
+  kn = k;
+  /* Estimates the factor relating pulses_left and positions_left to E(|x|).*/
+  mean_k_q8 = means[OD_ADAPT_K_Q8];
+  mean_sum_ex_q8 = means[OD_ADAPT_SUM_EX_Q8];
+  if (mean_k_q8 < 1 << 23) exp_q8 = 256*mean_k_q8/(1 + mean_sum_ex_q8);
+  else exp_q8 = mean_k_q8/(1 + (mean_sum_ex_q8 >> 8));
+  for (i = 0; i < n; i++) {
+    int ex;
+    int x;
+    if (kn == 0) break;
+    if (kn <= 1 && i != n - 1) {
+      laplace_decode_vector_delta(dec, y + i, n - i, kn, curr, means, acc_str);
+      ran_delta = 1;
+      i = n;
+      break;
+    }
+    /* Expected value of x (round-to-nearest) is
+       expQ8*pulses_left/positions_left. */
+    ex = (2*exp_q8*kn + (n - i))/(2*(n - i));
+    if (ex > kn*256) ex = kn*256;
+    sum_ex += (2*256*kn + (n - i))/(2*(n - i));
+    /* No need to encode the magnitude for the last bin. */
+    if (i != n - 1) x = laplace_decode(dec, ex, kn, acc_str);
+    else x = kn;
+    if (x != 0) {
+      if (od_ec_dec_bits(dec, 1, acc_str)) x = -x;
+    }
+    y[i] = x;
+    kn -= abs(x);
+  }
+  /* Adapting the estimates for expQ8. */
+  if (!ran_delta) {
+    curr[OD_ADAPT_COUNT_Q8] = OD_ADAPT_NO_VALUE;
+    curr[OD_ADAPT_COUNT_EX_Q8] = OD_ADAPT_NO_VALUE;
+  }
+  curr[OD_ADAPT_K_Q8] = k - kn;
+  curr[OD_ADAPT_SUM_EX_Q8] = sum_ex;
+  for (; i < n; i++) y[i] = 0;
+}
diff --git a/av1/decoder/pvq_decoder.c b/av1/decoder/pvq_decoder.c
new file mode 100644
index 0000000..2340605
--- /dev/null
+++ b/av1/decoder/pvq_decoder.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./aom_config.h"
+#include "aom_dsp/entcode.h"
+#include "aom_dsp/entdec.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/partition.h"
+#include "av1/common/pvq_state.h"
+#include "av1/decoder/decint.h"
+#include "av1/decoder/pvq_decoder.h"
+
+static void od_decode_pvq_codeword(od_ec_dec *ec, od_pvq_codeword_ctx *ctx,
+ od_coeff *y, int n, int k) {
+  int i;
+  od_decode_band_pvq_splits(ec, ctx, y, n, k, 0);
+  for (i = 0; i < n; i++) {
+    if (y[i] && od_ec_dec_bits(ec, 1, "pvq:sign")) y[i] = -y[i];
+  }
+}
+
+/** Inverse of neg_interleave; decodes the interleaved gain.
+ *
+ * @param [in]      x      quantized/interleaved gain to decode
+ * @param [in]      ref    quantized gain of the reference
+ * @return                 original quantized gain value
+ */
+static int neg_deinterleave(int x, int ref) {
+  if (x < 2*ref-1) {
+    if (x & 1) return ref - 1 - (x >> 1);
+    else return ref + (x >> 1);
+  }
+  else return x+1;
+}
+
+/** Synthesizes one parition of coefficient values from a PVQ-encoded
+ * vector.
+ *
+ * @param [out]     xcoeff  output coefficient partition (x in math doc)
+ * @param [in]      ypulse  PVQ-encoded values (y in math doc); in the noref
+ *                          case, this vector has n entries, in the
+ *                          reference case it contains n-1 entries
+ *                          (the m-th entry is not included)
+ * @param [in]      ref     reference vector (prediction)
+ * @param [in]      n       number of elements in this partition
+ * @param [in]      gr      gain of the reference vector (prediction)
+ * @param [in]      noref   indicates presence or lack of prediction
+ * @param [in]      g       decoded quantized vector gain
+ * @param [in]      theta   decoded theta (prediction error)
+ * @param [in]      qm      QM with magnitude compensation
+ * @param [in]      qm_inv  Inverse of QM with magnitude compensation
+ */
+static void pvq_synthesis(od_coeff *xcoeff, od_coeff *ypulse, od_val16 *r16,
+ int n, od_val32 gr, int noref, od_val32 g, od_val32 theta, const int16_t *qm_inv,
+ int shift) {
+  int s;
+  int m;
+  /* Sign of the Householder reflection vector */
+  s = 0;
+  /* Direction of the Householder reflection vector */
+  m = noref ? 0 : od_compute_householder(r16, n, gr, &s, shift);
+  od_pvq_synthesis_partial(xcoeff, ypulse, r16, n, noref, g, theta, m, s,
+   qm_inv);
+}
+
+typedef struct {
+  od_coeff *ref;
+  int nb_coeffs;
+  int allow_flip;
+} cfl_ctx;
+
+/** Decodes a single vector of integers (eg, a partition within a
+ *  coefficient block) encoded using PVQ
+ *
+ * @param [in,out] ec      range encoder
+ * @param [in]     q0      scale/quantizer
+ * @param [in]     n       number of coefficients in partition
+ * @param [in,out] model   entropy decoder state
+ * @param [in,out] adapt   adaptation context
+ * @param [in,out] exg     ExQ16 expectation of decoded gain value
+ * @param [in,out] ext     ExQ16 expectation of decoded theta value
+ * @param [in]     ref     'reference' (prediction) vector
+ * @param [out]    out     decoded partition
+ * @param [out]    noref   boolean indicating absence of reference
+ * @param [in]     beta    per-band activity masking beta param
+ * @param [in]     robust  stream is robust to error in the reference
+ * @param [in]     is_keyframe whether we're encoding a keyframe
+ * @param [in]     pli     plane index
+ * @param [in]     cdf_ctx selects which cdf context to use
+ * @param [in,out] skip_rest whether to skip further bands in each direction
+ * @param [in]     band    index of the band being decoded
+ * @param [in]     band    index of the band being decoded
+ * @param [out]    skip    skip flag with range [0,1]
+ * @param [in]     qm      QM with magnitude compensation
+ * @param [in]     qm_inv  Inverse of QM with magnitude compensation
+ */
+static void pvq_decode_partition(od_ec_dec *ec,
+                                 int q0,
+                                 int n,
+                                 generic_encoder model[3],
+                                 od_adapt_ctx *adapt,
+                                 int *exg,
+                                 int *ext,
+                                 od_coeff *ref,
+                                 od_coeff *out,
+                                 int *noref,
+                                 od_val16 beta,
+                                 int robust,
+                                 int is_keyframe,
+                                 int pli,
+                                 int cdf_ctx,
+                                 cfl_ctx *cfl,
+                                 int has_skip,
+                                 int *skip_rest,
+                                 int band,
+                                 int *skip,
+                                 const int16_t *qm,
+                                 const int16_t *qm_inv) {
+  int k;
+  od_val32 qcg;
+  int max_theta;
+  int itheta;
+  od_val32 theta;
+  od_val32 gr;
+  od_val32 gain_offset;
+  od_coeff y[MAXN];
+  int qg;
+  int nodesync;
+  int id;
+  int i;
+  od_val16 ref16[MAXN];
+  int rshift;
+  theta = 0;
+  gr = 0;
+  gain_offset = 0;
+  /* We always use the robust bitstream for keyframes to avoid having
+     PVQ and entropy decoding depending on each other, hurting parallelism. */
+  nodesync = robust || is_keyframe;
+  /* Skip is per-direction. For band=0, we can use any of the flags. */
+  if (skip_rest[(band + 2) % 3]) {
+    qg = 0;
+    if (is_keyframe) {
+      itheta = -1;
+      *noref = 1;
+    }
+    else {
+      itheta = 0;
+      *noref = 0;
+    }
+  }
+  else {
+    /* Jointly decode gain, itheta and noref for small values. Then we handle
+       larger gain. We need to wait for itheta because in the !nodesync case
+       it depends on max_theta, which depends on the gain. */
+    id = od_decode_cdf_adapt(ec, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
+     8 + 7*has_skip, adapt->pvq.pvq_gaintheta_increment,
+     "pvq:gaintheta");
+    if (!is_keyframe && id >= 10) id++;
+    if (is_keyframe && id >= 8) id++;
+    if (id >= 8) {
+      id -= 8;
+      skip_rest[0] = skip_rest[1] = skip_rest[2] = 1;
+    }
+    qg = id & 1;
+    itheta = (id >> 1) - 1;
+    *noref = (itheta == -1);
+  }
+  /* The CfL flip bit is only decoded on the first band that has noref=0. */
+  if (cfl->allow_flip && !*noref) {
+    int flip;
+    flip = od_ec_dec_bits(ec, 1, "cfl:flip");
+    if (flip) {
+      for (i = 0; i < cfl->nb_coeffs; i++) cfl->ref[i] = -cfl->ref[i];
+    }
+    cfl->allow_flip = 0;
+  }
+  if (qg > 0) {
+    int tmp;
+    tmp = *exg;
+    qg = 1 + generic_decode(ec, &model[!*noref], -1, &tmp, 2, "pvq:gain");
+    OD_IIR_DIADIC(*exg, qg << 16, 2);
+  }
+  *skip = 0;
+#if defined(OD_FLOAT_PVQ)
+  rshift = 0;
+#else
+  /* Shift needed to make the reference fit in 15 bits, so that the Householder
+     vector can fit in 16 bits. */
+  rshift = OD_MAXI(0, od_vector_log_mag(ref, n) - 14);
+#endif
+  for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+    ref16[i] = ref[i]*(double)qm[i]*OD_QM_SCALE_1;
+#else
+    ref16[i] = OD_SHR_ROUND(ref[i]*qm[i], OD_QM_SHIFT + rshift);
+#endif
+  }
+  if(!*noref){
+    /* we have a reference; compute its gain */
+    od_val32 cgr;
+    int icgr;
+    int cfl_enabled;
+    cfl_enabled = pli != 0 && is_keyframe && !OD_DISABLE_CFL;
+    cgr = od_pvq_compute_gain(ref16, n, q0, &gr, beta, rshift);
+    if (cfl_enabled) cgr = OD_CGAIN_SCALE;
+#if defined(OD_FLOAT_PVQ)
+    icgr = (int)floor(.5 + cgr);
+#else
+    icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
+#endif
+    /* quantized gain is interleave encoded when there's a reference;
+       deinterleave it now */
+    if (is_keyframe) qg = neg_deinterleave(qg, icgr);
+    else {
+      qg = neg_deinterleave(qg, icgr + 1) - 1;
+      if (qg == 0) *skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
+    }
+    if (qg == icgr && itheta == 0 && !cfl_enabled) *skip = OD_PVQ_SKIP_COPY;
+    gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
+    qcg = OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset;
+    /* read and decode first-stage PVQ error theta */
+    max_theta = od_pvq_compute_max_theta(qcg, beta);
+    if (itheta > 1 && (nodesync || max_theta > 3)) {
+      int tmp;
+      tmp = *ext;
+      itheta = 2 + generic_decode(ec, &model[2], nodesync ? -1 : max_theta - 3,
+       &tmp, 2, "pvq:theta");
+      OD_IIR_DIADIC(*ext, itheta << 16, 2);
+    }
+    theta = od_pvq_compute_theta(itheta, max_theta);
+  }
+  else{
+    itheta = 0;
+    if (!is_keyframe) qg++;
+    qcg = OD_SHL(qg, OD_CGAIN_SHIFT);
+    if (qg == 0) *skip = OD_PVQ_SKIP_ZERO;
+  }
+
+  k = od_pvq_compute_k(qcg, itheta, theta, *noref, n, beta, nodesync);
+  if (k != 0) {
+    /* when noref==0, y is actually size n-1 */
+    od_decode_pvq_codeword(ec, &adapt->pvq.pvq_codeword_ctx, y, n - !*noref,
+     k);
+  }
+  else {
+    OD_CLEAR(y, n);
+  }
+  if (*skip) {
+    if (*skip == OD_PVQ_SKIP_COPY) OD_COPY(out, ref, n);
+    else OD_CLEAR(out, n);
+  }
+  else {
+    od_val32 g;
+    g = od_gain_expand(qcg, q0, beta);
+    pvq_synthesis(out, y, ref16, n, gr, *noref, g, theta, qm_inv, rshift);
+  }
+  *skip = !!*skip;
+}
+
+/** Decodes a coefficient block (except for DC) encoded using PVQ
+ *
+ * @param [in,out] dec     daala decoder context
+ * @param [in]     ref     'reference' (prediction) vector
+ * @param [out]    out     decoded partition
+ * @param [in]     q0      quantizer
+ * @param [in]     pli     plane index
+ * @param [in]     bs      log of the block size minus two
+ * @param [in]     beta    per-band activity masking beta param
+ * @param [in]     robust  stream is robust to error in the reference
+ * @param [in]     is_keyframe whether we're encoding a keyframe
+ * @param [out]    flags   bitmask of the per band skip and noref flags
+ * @param [in]     block_skip skip flag for the block (range 0-3)
+ * @param [in]     qm      QM with magnitude compensation
+ * @param [in]     qm_inv  Inverse of QM with magnitude compensation
+ */
+void od_pvq_decode(daala_dec_ctx *dec,
+                   od_coeff *ref,
+                   od_coeff *out,
+                   int q0,
+                   int pli,
+                   int bs,
+                   const od_val16 *beta,
+                   int robust,
+                   int is_keyframe,
+                   unsigned int *flags,
+                   int block_skip,
+                   const int16_t *qm,
+                   const int16_t *qm_inv){
+
+  int noref[PVQ_MAX_PARTITIONS];
+  int skip[PVQ_MAX_PARTITIONS];
+  int *exg;
+  int *ext;
+  int nb_bands;
+  int i;
+  const int *off;
+  int size[PVQ_MAX_PARTITIONS];
+  generic_encoder *model;
+  int skip_rest[3] = {0};
+  cfl_ctx cfl;
+  /* const unsigned char *pvq_qm; */
+  /*Default to skip=1 and noref=0 for all bands.*/
+  for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
+    noref[i] = 0;
+    skip[i] = 1;
+  }
+  /* TODO(yushin): Enable this for activity masking,
+     when pvq_qm_q4 is available in AOM. */
+  /*pvq_qm = &dec->state.pvq_qm_q4[pli][0];*/
+  exg = &dec->state.adapt.pvq.pvq_exg[pli][bs][0];
+  ext = dec->state.adapt.pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
+  model = dec->state.adapt.pvq.pvq_param_model;
+  nb_bands = OD_BAND_OFFSETS[bs][0];
+  off = &OD_BAND_OFFSETS[bs][1];
+  OD_ASSERT(block_skip < 4);
+  out[0] = block_skip & 1;
+  if (!(block_skip >> 1)) {
+    if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
+    else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
+  }
+  else {
+    for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
+    cfl.ref = ref;
+    cfl.nb_coeffs = off[nb_bands];
+    cfl.allow_flip = pli != 0 && is_keyframe;
+    for (i = 0; i < nb_bands; i++) {
+      int q;
+      /* TODO(yushin): Enable this for activity masking,
+         when pvq_qm_q4 is available in AOM. */
+      /*q = OD_MAXI(1, q0*pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);*/
+      q = OD_MAXI(1, q0);
+      pvq_decode_partition(dec->ec, q, size[i],
+       model, &dec->state.adapt, exg + i, ext + i, ref + off[i], out + off[i],
+       &noref[i], beta[i], robust, is_keyframe, pli,
+       (pli != 0)*OD_NBSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
+       &cfl, i == 0 && (i < nb_bands - 1), skip_rest, i, &skip[i],
+       qm + off[i], qm_inv + off[i]);
+      if (i == 0 && !skip_rest[0] && bs > 0) {
+        int skip_dir;
+        int j;
+        skip_dir = od_decode_cdf_adapt(dec->ec,
+         &dec->state.adapt.pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7,
+         dec->state.adapt.pvq.pvq_skip_dir_increment, "pvq:skiprest");
+        for (j = 0; j < 3; j++) skip_rest[j] = !!(skip_dir & (1 << j));
+      }
+    }
+  }
+  *flags = 0;
+  for (i = nb_bands - 1; i >= 0; i--) {
+    *flags <<= 1;
+    *flags |= noref[i]&1;
+    *flags <<= 1;
+    *flags |= skip[i]&1;
+  }
+}
diff --git a/av1/decoder/pvq_decoder.h b/av1/decoder/pvq_decoder.h
new file mode 100644
index 0000000..d749040
--- /dev/null
+++ b/av1/decoder/pvq_decoder.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#if !defined(_pvq_decoder_H)
+# define _pvq_decoder_H (1)
+# include "aom_dsp/entdec.h"
+# include "av1/common/pvq.h"
+# include "av1/decoder/decint.h"
+
+void od_decode_band_pvq_splits(od_ec_dec *ec, od_pvq_codeword_ctx *adapt,
+ od_coeff *y, int n, int k, int level);
+
+#if OD_ACCOUNTING
+# define laplace_decode_special(dec, decay, max, str) od_laplace_decode_special_(dec, decay, max, str)
+# define laplace_decode(dec, ex_q8, k, str) od_laplace_decode_(dec, ex_q8, k, str)
+#define laplace_decode_vector(dec, y, n, k, curr, means, str) od_laplace_decode_vector_(dec, y, n, k, curr, means, str)
+#else
+# define laplace_decode_special(dec, decay, max, str) od_laplace_decode_special_(dec, decay, max)
+# define laplace_decode(dec, ex_q8, k, str) od_laplace_decode_(dec, ex_q8, k)
+#define laplace_decode_vector(dec, y, n, k, curr, means, str) od_laplace_decode_vector_(dec, y, n, k, curr, means)
+#endif
+
+int od_laplace_decode_special_(od_ec_dec *dec, unsigned decay, int max OD_ACC_STR);
+int od_laplace_decode_(od_ec_dec *dec, unsigned ex_q8, int k OD_ACC_STR);
+void od_laplace_decode_vector_(od_ec_dec *dec, od_coeff *y, int n, int k,
+                                  int32_t *curr, const int32_t *means
+                                  OD_ACC_STR);
+
+
+void od_pvq_decode(daala_dec_ctx *dec, od_coeff *ref, od_coeff *out, int q0,
+ int pli, int bs, const od_val16 *beta, int robust, int is_keyframe,
+ unsigned int *flags, int block_skip, const int16_t *qm,
+ const int16_t *qm_inv);
+
+#endif