New experiment: Perceptual Vector Quantization from Daala

PVQ replaces the scalar quantizer and coefficient coding with a new
design originally developed in Daala. It currently depends on the
Daala entropy coder, although it could be adapted to work with another
entropy coder if needed. To enable it, configure with:
./configure --enable-experimental --enable-daala_ec --enable-pvq

The version of PVQ in this commit is adapted from the following
revision of Daala:

More information about PVQ:

The following files are copied as-is from Daala with minimal
adaptations; we therefore disable clang-format on those files
to make it easier to synchronize the AV1 and Daala codebases in the future:

Known issues:
- Lossless mode is not supported; '--lossless=1' will give the same result as
'--end-usage=q --cq-level=1'.
- High bit depth is not supported by PVQ.

Change-Id: I1ae0d6517b87f4c1ccea944b2e12dc906979f25e
diff --git a/aom_dsp/ b/aom_dsp/
index 5025556..34be453 100644
--- a/aom_dsp/
+++ b/aom_dsp/
@@ -212,6 +212,24 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+ifeq ($(CONFIG_PVQ),yes)
+DSP_SRCS-yes            += fwd_txfm.c
+DSP_SRCS-yes            += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+endif  # CONFIG_PVQ
 # inverse transform
 ifneq ($(filter yes,$(CONFIG_AV1)),)
 DSP_SRCS-yes            += inv_txfm.h
diff --git a/aom_dsp/ b/aom_dsp/
index dc9a6b9..53af943 100644
--- a/aom_dsp/
+++ b/aom_dsp/
@@ -595,7 +595,7 @@
 # Forward transform
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq "yes")){
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/aom_fdct4x4 sse2/;
diff --git a/aom_dsp/daalaboolreader.h b/aom_dsp/daalaboolreader.h
index 9d6cebd..8977995 100644
--- a/aom_dsp/daalaboolreader.h
+++ b/aom_dsp/daalaboolreader.h
@@ -41,7 +41,7 @@
 static INLINE int aom_daala_read(daala_reader *r, int prob) {
   if (prob == 128) {
-    return od_ec_dec_bits(&r->ec, 1);
+    return od_ec_dec_bits(&r->ec, 1, "aom_bits");
   } else {
     int p = ((prob << 15) + (256 - prob)) >> 8;
     return od_ec_decode_bool_q15(&r->ec, p);
diff --git a/aom_dsp/entcode.c b/aom_dsp/entcode.c
index 49284b0..ff8e8e2 100644
--- a/aom_dsp/entcode.c
+++ b/aom_dsp/entcode.c
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2001-2012 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #include "./config.h"
diff --git a/aom_dsp/entcode.h b/aom_dsp/entcode.h
index 77ed171..91fcb67 100644
--- a/aom_dsp/entcode.h
+++ b/aom_dsp/entcode.h
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2001-2013 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #if !defined(_entcode_H)
 #define _entcode_H (1)
diff --git a/aom_dsp/entdec.c b/aom_dsp/entdec.c
index 18563b2..b015956 100644
--- a/aom_dsp/entdec.c
+++ b/aom_dsp/entdec.c
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2001-2013 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #include "./config.h"
@@ -440,7 +427,7 @@
     ftb = OD_ILOG_NZ(ft) - OD_EC_UINT_BITS;
     ft1 = (int)(ft >> ftb) + 1;
     t = od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft1), ft1);
-    t = t << ftb | od_ec_dec_bits(dec, ftb);
+    t = t << ftb | od_ec_dec_bits(dec, ftb, "");
     if (t <= ft) return t;
     dec->error = 1;
     return ft;
@@ -453,7 +440,7 @@
   ftb: The number of bits to extract.
        This must be between 0 and 25, inclusive.
   Return: The decoded bits.*/
-uint32_t od_ec_dec_bits(od_ec_dec *dec, unsigned ftb) {
+uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
   od_ec_window window;
   int available;
   uint32_t ret;
diff --git a/aom_dsp/entdec.h b/aom_dsp/entdec.h
index 80363b5..6d6e2b5 100644
--- a/aom_dsp/entdec.h
+++ b/aom_dsp/entdec.h
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2001-2013 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #if !defined(_entdec_H)
 #define _entdec_H (1)
@@ -33,6 +20,14 @@
 typedef struct od_ec_dec od_ec_dec;
+#define OD_ACC_STR , char *acc_str
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
+#define OD_ACC_STR
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
 /*The entropy decoder context.*/
 struct od_ec_dec {
   /*The start of the current input buffer.*/
@@ -91,7 +86,7 @@
 OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_uint(od_ec_dec *dec, uint32_t ft)
-OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits(od_ec_dec *dec, unsigned ftb)
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
 OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
diff --git a/aom_dsp/entenc.c b/aom_dsp/entenc.c
index 3e9cb62..390f61b 100644
--- a/aom_dsp/entenc.c
+++ b/aom_dsp/entenc.c
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2001-2013 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #include "./config.h"
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h
index 32163f7..5e121b6 100644
--- a/aom_dsp/entenc.h
+++ b/aom_dsp/entenc.h
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2001-2013 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #if !defined(_entenc_H)
 #define _entenc_H (1)
diff --git a/av1/ b/av1/
index 75a2569..e254ddc 100644
--- a/av1/
+++ b/av1/
@@ -96,6 +96,24 @@
 AV1_COMMON_SRCS-yes += common/odintrin.c
 AV1_COMMON_SRCS-yes += common/odintrin.h
+ifeq ($(CONFIG_PVQ),yes)
+# PVQ from daala
+AV1_COMMON_SRCS-yes += common/pvq.c
+AV1_COMMON_SRCS-yes += common/pvq.h
+AV1_COMMON_SRCS-yes += common/partition.c
+AV1_COMMON_SRCS-yes += common/partition.h
+AV1_COMMON_SRCS-yes += common/zigzag4.c
+AV1_COMMON_SRCS-yes += common/zigzag8.c
+AV1_COMMON_SRCS-yes += common/zigzag16.c
+AV1_COMMON_SRCS-yes += common/zigzag32.c
+AV1_COMMON_SRCS-yes += common/zigzag.h
+AV1_COMMON_SRCS-yes += common/generic_code.c
+AV1_COMMON_SRCS-yes += common/generic_code.h
+AV1_COMMON_SRCS-yes += common/pvq_state.c
+AV1_COMMON_SRCS-yes += common/pvq_state.h
+AV1_COMMON_SRCS-yes += common/laplace_tables.c
 AV1_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
 AV1_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans8_dspr2.c
diff --git a/av1/ b/av1/
index 7ec294b..1dc31a4 100644
--- a/av1/
+++ b/av1/
@@ -96,6 +96,16 @@
 AV1_CX_SRCS-$(HAVE_NEON) += encoder/clpf_rdo_neon.c
+ifeq ($(CONFIG_PVQ),yes)
+# PVQ from daala
+AV1_CX_SRCS-yes += encoder/daala_compat_enc.c
+AV1_CX_SRCS-yes += encoder/pvq_encoder.c
+AV1_CX_SRCS-yes += encoder/pvq_encoder.h
+AV1_CX_SRCS-yes += encoder/encint.h
+AV1_CX_SRCS-yes += encoder/generic_encoder.c
+AV1_CX_SRCS-yes += encoder/laplace_encoder.c
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
diff --git a/av1/ b/av1/
index 362e7c6..24decc7 100644
--- a/av1/
+++ b/av1/
@@ -32,4 +32,29 @@
 AV1_DX_SRCS-yes += decoder/dsubexp.c
 AV1_DX_SRCS-yes += decoder/dsubexp.h
+ifeq ($(CONFIG_PVQ),yes)
+# PVQ from daala
+AV1_DX_SRCS-yes += decoder/pvq_decoder.c
+AV1_DX_SRCS-yes += decoder/pvq_decoder.h
+AV1_DX_SRCS-yes += decoder/decint.h
+AV1_DX_SRCS-yes += decoder/generic_decoder.c
+AV1_DX_SRCS-yes += decoder/laplace_decoder.c
+AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.c
+AV1_DX_SRCS-yes += encoder/hybrid_fwd_txfm.h
+AV1_DX_SRCS-yes += encoder/dct.c
+AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+AV1_DX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
+AV1_DX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+AV1_DX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c
+AV1_DX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h
 AV1_DX_SRCS-yes := $(filter-out $(AV1_DX_SRCS_REMOVE-yes),$(AV1_DX_SRCS-yes))
diff --git a/av1/common/ b/av1/common/
index d939c44..6b7623b 100644
--- a/av1/common/
+++ b/av1/common/
@@ -633,6 +633,196 @@
 # end encoder functions
+# If PVQ is enabled, fwd transforms are required by decoder
+if (aom_config("CONFIG_PVQ") eq "yes") {
+# fdct functions
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht4x4 sse2/;
+  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht8x8 sse2/;
+  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht16x16 sse2/;
+  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_fwht4x4 sse2/;
+  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4/;
+    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4_1/;
+    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8/;
+    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8_1/;
+    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16/;
+    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16_1/;
+    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32/;
+    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_rd/;
+    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_1/;
+    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct4x4/;
+    add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct8x8/;
+    add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct8x8_1/;
+    add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct16x16/;
+    add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct16x16_1/;
+    add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct32x32/;
+    add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct32x32_rd/;
+    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct32x32_1/;
+  } else {
+    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4 sse2/;
+    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4_1 sse2/;
+    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8 sse2/;
+    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8_1 sse2/;
+    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16 sse2/;
+    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16_1 sse2/;
+    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32 sse2/;
+    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_rd sse2/;
+    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_1 sse2/;
+    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct4x4 sse2/;
+    add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct8x8 sse2/;
+    add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct8x8_1/;
+    add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct16x16 sse2/;
+    add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct16x16_1/;
+    add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct32x32 sse2/;
+    add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct32x32_rd sse2/;
+    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_highbd_fdct32x32_1/;
+  }
+} else {
+  add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht4x4 sse2 msa/;
+  add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht8x8 sse2 msa/;
+  add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht16x16 sse2 msa/;
+  add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_fwht4x4 msa sse2/;
+  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4/;
+    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4_1/;
+    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8/;
+    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8_1/;
+    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16/;
+    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16_1/;
+    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32/;
+    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_rd/;
+    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_1/;
+  } else {
+    add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4 sse2/;
+    add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct4x4_1 sse2/;
+    add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8 sse2/;
+    add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct8x8_1 sse2/;
+    add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16 sse2/;
+    add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct16x16_1 sse2/;
+    add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32 sse2/;
+    add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_rd sse2/;
+    add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/av1_fdct32x32_1 sse2/;
+  }
 # Deringing Functions
 if (aom_config("CONFIG_DERING") eq "yes") {
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 435f7bd..932bdf2 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -86,6 +86,7 @@
     av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       TX_SIZE tx_size, int has_eob, int aoff, int loff) {
   ENTROPY_CONTEXT *const a = pd->above_context + aoff;
@@ -121,6 +122,7 @@
     memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
 void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) {
   int i;
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index d46ba57..5a1a230 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -28,6 +28,11 @@
 #include "av1/common/scale.h"
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
+#include "av1/common/pvq.h"
+#include "av1/common/pvq_state.h"
+#include "av1/decoder/decint.h"
 #ifdef __cplusplus
 extern "C" {
@@ -87,6 +92,33 @@
   return mode >= NEARESTMV && mode <= NEWMV;
+typedef struct PVQ_INFO {
+  int theta[PVQ_MAX_PARTITIONS];
+  int max_theta[PVQ_MAX_PARTITIONS];
+  od_coeff y[OD_BSIZE_MAX * OD_BSIZE_MAX];
+  int nb_bands;
+  int size[PVQ_MAX_PARTITIONS];
+  int skip_rest;
+  int skip_dir;
+  int bs;           // log of the block size minus two,
+                    // i.e. equivalent to aom's TX_SIZE
+  int ac_dc_coded;  // block skip info, indicating whether DC/AC is coded.
+                    // bit0: DC coded, bit1 : AC coded (1 means coded)
+  tran_low_t dq_dc_residue;
+typedef struct PVQ_QUEUE {
+  PVQ_INFO *buf;  // buffer for pvq info, stored in encoding order
+  int curr_pos;   // curr position to write PVQ_INFO
+  int buf_len;    // allocated buffer length
+  int last_pos;   // last written position of PVQ_INFO in a tile
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -223,6 +255,12 @@
   const qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
+  DECLARE_ALIGNED(16, int16_t, pred[MAX_SB_SQUARE]);
+  // PVQ: forward transformed predicted image, a reference for PVQ.
+  tran_low_t *pvq_ref_coeff;
 #define BLOCK_OFFSET(x, i) ((x) + (i)*16)
@@ -282,6 +320,9 @@
   PARTITION_CONTEXT *above_seg_context;
   PARTITION_CONTEXT left_seg_context[8];
+  daala_dec_ctx daala_dec;
   /* Bit depth: 8, 10, 12 */
   int bd;
diff --git a/av1/common/generic_code.c b/av1/common/generic_code.c
new file mode 100644
index 0000000..4022cf1
--- /dev/null
+++ b/av1/common/generic_code.c
@@ -0,0 +1,145 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+# include "config.h"
+#include "generic_code.h"
+void od_cdf_init(uint16_t *cdf, int ncdfs, int nsyms, int val, int first) {
+  int i;
+  int j;
+  for (i = 0; i < ncdfs; i++) {
+    for (j = 0; j < nsyms; j++) {
+      cdf[i*nsyms + j] = val*j + first;
+    }
+  }
+/** Adapts a Q15 cdf after encoding/decoding a symbol. */
+void od_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate) {
+  int i;
+  *count = OD_MINI(*count + 1, 1 << rate);
+  OD_ASSERT(cdf[n - 1] == 32768);
+  if (*count >= 1 << rate) {
+    /* Steady-state adaptation based on a simple IIR with dyadic rate. */
+    for (i = 0; i < n; i++) {
+      int tmp;
+      /* When (i < val), we want the adjustment ((cdf[i] - tmp) >> rate) to be
+         positive so long as (cdf[i] > i + 1), and 0 when (cdf[i] == i + 1),
+         to ensure we don't drive any probabilities to 0. Replacing cdf[i] with
+         (i + 2) and solving ((i + 2 - tmp) >> rate == 1) for tmp produces
+         tmp == i + 2 - (1 << rate). Using this value of tmp with
+         cdf[i] == i + 1 instead gives an adjustment of 0 as desired.
+         When (i >= val), we want ((cdf[i] - tmp) >> rate) to be negative so
+         long as cdf[i] < 32768 - (n - 1 - i), and 0 when
+         cdf[i] == 32768 - (n - 1 - i), again to ensure we don't drive any
+         probabilities to 0. Since right-shifting any negative value is still
+         negative, we can solve (32768 - (n - 1 - i) - tmp == 0) for tmp,
+         producing tmp = 32769 - n + i. Using this value of tmp with smaller
+         values of cdf[i] instead gives negative adjustments, as desired.
+         Combining the two cases gives the expression below. These could be
+         stored in a lookup table indexed by n and rate to avoid the
+         arithmetic. */
+      tmp = 2 - (1<<rate) + i + (32767 + (1<<rate) - n)*(i >= val);
+      cdf[i] -= (cdf[i] - tmp) >> rate;
+    }
+  }
+  else {
+    int alpha;
+    /* Initial adaptation for the first symbols. The adaptation rate is
+       computed to be equivalent to what od_{en,de}code_cdf_adapt() does
+       when the initial cdf is set to increment/4. */
+    alpha = 4*32768/(n + 4**count);
+    for (i = 0; i < n; i++) {
+      int tmp;
+      tmp = (32768 - n)*(i >= val) + i + 1;
+      cdf[i] -= ((cdf[i] - tmp)*alpha) >> 15;
+    }
+  }
+  OD_ASSERT(cdf[n - 1] == 32768);
+/** Initializes the cdfs and freq counts for a model.
+ *
+ * @param [out] model model being initialized
+ */
+void generic_model_init(generic_encoder *model) {
+  int i;
+  int j;
+  model->increment = 64;
+  for (i = 0; i < GENERIC_TABLES; i++) {
+    for (j = 0; j < 16; j++) {
+      /* Do flat initialization equivalent to a single symbol in each bin. */
+      model->cdf[i][j] = (j + 1) * model->increment;
+    }
+  }
+/** Takes the base-2 log of E(x) in Q1.
+ *
+ * @param [in] ex_q16 expectation of x in Q16
+ *
+ * @retval 2*log2(ex_q16/2^16)
+ */
+int log_ex(int ex_q16) {
+  int lg;
+  int lg_q1;
+  int odd;
+  lg = OD_ILOG(ex_q16);
+  if (lg < 15) {
+    odd = ex_q16*ex_q16 > 2 << 2*lg;
+  }
+  else {
+    int tmp;
+    tmp = ex_q16 >> (lg - 8);
+    odd = tmp*tmp > (1 << 15);
+  }
+  lg_q1 = OD_MAXI(0, 2*lg - 33 + odd);
+  return lg_q1;
+/** Updates the probability model based on the encoded/decoded value
+ *
+ * @param [in,out] model generic prob model
+ * @param [in,out] ex_q16 expectation of x
+ * @param [in]     x     variable encoded/decoded (used for ex_q16)
+ * @param [in]     xs    variable x after shift (used for the model)
+ * @param [in]     id    id of the cdf to adapt
+ * @param [in]     integration integration period of ex_q16 (leaky average over
+ * 1<<integration samples)
+ */
+void generic_model_update(generic_encoder *model, int *ex_q16, int x, int xs,
+ int id, int integration) {
+  int i;
+  int xenc;
+  uint16_t *cdf;
+  cdf = model->cdf[id];
+  /* Renormalize if we cannot add increment */
+  if (cdf[15] + model->increment > 32767) {
+    for (i = 0; i < 16; i++) {
+      /* Second term ensures that the pdf is non-null */
+      cdf[i] = (cdf[i] >> 1) + i + 1;
+    }
+  }
+  /* Update freq count */
+  xenc = OD_MINI(15, xs);
+  /* This can be easily vectorized */
+  for (i = xenc; i < 16; i++) cdf[i] += model->increment;
+  /* We could have saturated ExQ16 directly, but this is safe and simpler */
+  x = OD_MINI(x, 32767);
+  OD_IIR_DIADIC(*ex_q16, x << 16, integration);
diff --git a/av1/common/generic_code.h b/av1/common/generic_code.h
new file mode 100644
index 0000000..6059190
--- /dev/null
+++ b/av1/common/generic_code.h
@@ -0,0 +1,86 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_generic_code_H)
+# define _generic_code_H
+# include "aom_dsp/entdec.h"
+# include "aom_dsp/entenc.h"
+# define GENERIC_TABLES 12
+# define generic_decode(dec, model, max, ex_q16, integration, str) generic_decode_(dec, model, max, ex_q16, integration, str)
+# define od_decode_cdf_adapt_q15(ec, cdf, n, count, rate, str) od_decode_cdf_adapt_q15_(ec, cdf, n, count, rate, str)
+# define od_decode_cdf_adapt(ec, cdf, n, increment, str) od_decode_cdf_adapt_(ec, cdf, n, increment, str)
+# define generic_decode(dec, model, max, ex_q16, integration, str) generic_decode_(dec, model, max, ex_q16, integration)
+# define od_decode_cdf_adapt_q15(ec, cdf, n, count, rate, str) od_decode_cdf_adapt_q15_(ec, cdf, n, count, rate)
+# define od_decode_cdf_adapt(ec, cdf, n, increment, str) od_decode_cdf_adapt_(ec, cdf, n, increment)
+typedef struct {
+  /** cdf for multiple expectations of x */
+  uint16_t cdf[GENERIC_TABLES][16];
+  /** Frequency increment for learning the cdfs */
+  int increment;
+} generic_encoder;
+#define OD_IIR_DIADIC(y, x, shift) ((y) += ((x) - (y)) >> (shift))
+void generic_model_init(generic_encoder *model);
+#define OD_CDFS_INIT(cdf, val) od_cdf_init(&cdf[0][0],\
+ sizeof(cdf)/sizeof(cdf[0]), sizeof(cdf[0])/sizeof(cdf[0][0]), val, val)
+#define OD_CDFS_INIT_FIRST(cdf, val, first) od_cdf_init(&cdf[0][0],\
+ sizeof(cdf)/sizeof(cdf[0]), sizeof(cdf[0])/sizeof(cdf[0][0]), val, first)
+#define OD_SINGLE_CDF_INIT(cdf, val) od_cdf_init(cdf,\
+ 1, sizeof(cdf)/sizeof(cdf[0]), val, val)
+#define OD_SINGLE_CDF_INIT_FIRST(cdf, val, first) od_cdf_init(cdf,\
+ 1, sizeof(cdf)/sizeof(cdf[0]), val, first)
+void od_cdf_init(uint16_t *cdf, int ncdfs, int nsyms, int val, int first);
+void od_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate);
+void od_encode_cdf_adapt_q15(od_ec_enc *ec, int val, uint16_t *cdf, int n,
+ int *count, int rate);
+void od_encode_cdf_adapt(od_ec_enc *ec, int val, uint16_t *cdf, int n,
+ int increment);
+int od_decode_cdf_adapt_(od_ec_dec *ec, uint16_t *cdf, int n,
+ int increment OD_ACC_STR);
+void generic_encode(od_ec_enc *enc, generic_encoder *model, int x, int max,
+ int *ex_q16, int integration);
+double generic_encode_cost(generic_encoder *model, int x, int max,
+ int *ex_q16);
+double od_encode_cdf_cost(int val, uint16_t *cdf, int n);
+int od_decode_cdf_adapt_q15_(od_ec_dec *ec, uint16_t *cdf, int n,
+ int *count, int rate OD_ACC_STR);
+int generic_decode_(od_ec_dec *dec, generic_encoder *model, int max,
+ int *ex_q16, int integration OD_ACC_STR);
+int log_ex(int ex_q16);
+void generic_model_update(generic_encoder *model, int *ex_q16, int x, int xs,
+ int id, int integration);
diff --git a/av1/common/laplace_tables.c b/av1/common/laplace_tables.c
new file mode 100644
index 0000000..f1c3f9a
--- /dev/null
+++ b/av1/common/laplace_tables.c
@@ -0,0 +1,272 @@
+/* This file is auto-generated using "gen_laplace_tables 128 7" */
+/* clang-format off */
+# include "config.h"
+#include "pvq.h"
+const uint16_t EXP_CDF_TABLE[128][16] = {
+  {32753,32754,32755,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {32499,32753,32755,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {32243,32747,32755,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {31987,32737,32755,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {31732,32724,32755,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {31476,32706,32754,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {31220,32684,32753,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {30964,32658,32751,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {30708,32628,32748,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {30452,32594,32745,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {30198,32558,32742,32756,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {29941,32515,32736,32755,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {29686,32470,32731,32755,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {29429,32419,32723,32754,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {29174,32366,32715,32753,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {28918,32308,32705,32752,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {28662,32246,32694,32750,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {28406,32180,32681,32748,32757,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {28150,32110,32667,32745,32756,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {27894,32036,32651,32742,32756,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {27639,31959,32634,32739,32755,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {27383,31877,32614,32735,32755,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {27126,31790,32592,32730,32754,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {26871,31701,32569,32725,32753,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {26615,31607,32543,32719,32752,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {26361,31511,32517,32713,32751,32758,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {26104,31408,32485,32704,32748,32757,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {25848,31302,32452,32695,32746,32757,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {25591,31191,32416,32684,32743,32756,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {25336,31078,32379,32674,32741,32756,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {25080,30960,32338,32661,32737,32755,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {24824,30838,32295,32648,32733,32754,32759,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {24568,30712,32248,32632,32728,32752,32758,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {24313,30583,32199,32616,32723,32751,32758,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {24057,30449,32147,32598,32718,32750,32758,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {23801,30311,32091,32578,32711,32747,32757,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {23546,30170,32033,32557,32704,32745,32757,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {23288,30022,31969,32532,32695,32742,32756,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {23033,29873,31904,32507,32686,32739,32755,32760,32761,32762,32763,32764,32765,32766,32767,32768},
+  {22778,29720,31835,32479,32675,32735,32753,32759,32761,32762,32763,32764,32765,32766,32767,32768},
+  {22521,29561,31761,32449,32664,32731,32752,32759,32761,32762,32763,32764,32765,32766,32767,32768},
+  {22267,29401,31686,32418,32652,32727,32751,32759,32761,32762,32763,32764,32765,32766,32767,32768},
+  {22011,29235,31605,32383,32638,32722,32749,32758,32761,32762,32763,32764,32765,32766,32767,32768},
+  {21754,29064,31520,32345,32622,32715,32746,32757,32761,32762,32763,32764,32765,32766,32767,32768},
+  {21501,28893,31434,32307,32607,32710,32745,32757,32761,32762,32763,32764,32765,32766,32767,32768},
+  {21243,28713,31339,32262,32587,32701,32741,32755,32760,32762,32763,32764,32765,32766,32767,32768},
+  {20988,28532,31243,32217,32567,32693,32738,32754,32760,32762,32763,32764,32765,32766,32767,32768},
+  {20730,28344,31140,32167,32544,32682,32733,32752,32759,32762,32763,32764,32765,32766,32767,32768},
+  {20476,28156,31036,32116,32521,32673,32730,32751,32759,32762,32763,32764,32765,32766,32767,32768},
+  {20220,27962,30926,32061,32495,32661,32725,32749,32758,32762,32763,32764,32765,32766,32767,32768},
+  {19963,27763,30810,32000,32465,32647,32718,32746,32757,32761,32763,32764,32765,32766,32767,32768},
+  {19708,27562,30691,31938,32435,32633,32712,32743,32756,32761,32763,32764,32765,32766,32767,32768},
+  {19454,27358,30569,31873,32403,32618,32705,32741,32755,32761,32763,32764,32765,32766,32767,32768},
+  {19196,27146,30438,31801,32365,32599,32696,32736,32753,32760,32763,32764,32765,32766,32767,32768},
+  {18942,26934,30306,31728,32328,32581,32688,32733,32752,32760,32763,32764,32765,32766,32767,32768},
+  {18684,26714,30164,31647,32284,32558,32676,32727,32749,32758,32762,32764,32765,32766,32767,32768},
+  {18429,26493,30021,31565,32240,32535,32664,32721,32746,32757,32762,32764,32765,32766,32767,32768},
+  {18174,26268,29872,31477,32192,32510,32652,32715,32743,32756,32762,32764,32765,32766,32767,32768},
+  {17920,26040,29719,31386,32141,32483,32638,32708,32740,32754,32761,32764,32765,32766,32767,32768},
+  {17661,25803,29556,31286,32083,32451,32620,32698,32734,32751,32759,32763,32765,32766,32767,32768},
+  {17406,25566,29391,31184,32024,32418,32603,32690,32731,32750,32759,32763,32765,32766,32767,32768},
+  {17151,25325,29220,31076,31961,32383,32584,32680,32726,32748,32758,32763,32765,32766,32767,32768},
+  {16896,25080,29044,30964,31894,32344,32562,32668,32719,32744,32756,32762,32765,32766,32767,32768},
+  {16639,24829,28860,30844,31821,32302,32539,32655,32712,32740,32754,32761,32764,32766,32767,32768},
+  {16384,24576,28672,30720,31744,32256,32512,32640,32704,32736,32752,32760,32764,32766,32767,32768},
+  {16130,24320,28479,30591,31663,32208,32485,32625,32696,32732,32750,32759,32764,32766,32767,32768},
+  {15872,24056,28276,30452,31574,32152,32450,32604,32683,32724,32745,32756,32762,32765,32766,32768},
+  {15615,23789,28068,30308,31480,32094,32415,32583,32671,32717,32741,32754,32761,32764,32766,32768},
+  {15361,23521,27856,30159,31382,32032,32377,32560,32657,32709,32737,32752,32760,32764,32766,32768},
+  {15103,23245,27634,30000,31275,31963,32334,32534,32642,32700,32731,32748,32757,32762,32765,32768},
+  {14848,22968,27409,29837,31165,31891,32288,32505,32624,32689,32725,32744,32755,32761,32764,32768},
+  {14592,22686,27176,29666,31047,31813,32238,32474,32605,32678,32718,32740,32752,32759,32763,32768},
+  {14336,22400,26936,29488,30923,31730,32184,32439,32583,32664,32709,32735,32749,32757,32762,32768},
+  {14079,22109,26689,29301,30791,31641,32125,32401,32559,32649,32700,32729,32746,32756,32761,32768},
+  {13825,21817,26437,29108,30652,31545,32061,32359,32532,32632,32690,32723,32742,32753,32759,32768},
+  {13568,21518,26176,28905,30504,31441,31990,32312,32501,32611,32676,32714,32736,32749,32757,32768},
+  {13314,21218,25911,28697,30351,31333,31916,32262,32468,32590,32662,32705,32731,32746,32755,32768},
+  {13054,20908,25633,28475,30185,31214,31833,32205,32429,32564,32645,32694,32723,32741,32752,32768},
+  {12803,20603,25356,28252,30017,31093,31748,32147,32390,32538,32628,32683,32717,32737,32749,32768},
+  {12544,20286,25064,28013,29833,30956,31649,32077,32341,32504,32605,32667,32705,32729,32744,32768},
+  {12288,19968,24768,27768,29643,30815,31547,32005,32291,32470,32582,32652,32696,32723,32740,32768},
+  {12033,19647,24465,27514,29443,30664,31437,31926,32235,32431,32555,32633,32683,32714,32734,32768},
+  {11777,19321,24154,27250,29233,30504,31318,31839,32173,32387,32524,32612,32668,32704,32727,32768},
+  {11521,18991,23835,26976,29013,30334,31190,31745,32105,32338,32489,32587,32651,32692,32719,32768},
+  {11265,18657,23508,26691,28780,30151,31051,31641,32028,32282,32449,32559,32631,32678,32709,32768},
+  {11006,18316,23170,26394,28535,29957,30901,31528,31944,32220,32404,32526,32607,32661,32697,32768},
+  {10752,17976,22830,26091,28282,29754,30743,31408,31854,32154,32356,32491,32582,32643,32684,32768},
+  {10496,17630,22479,25775,28015,29538,30573,31276,31754,32079,32300,32450,32552,32621,32668,32768},
+  {10240,17280,22120,25448,27736,29309,30390,31133,31644,31995,32237,32403,32517,32595,32649,32768},
+  { 9984,16926,21753,25109,27443,29066,30194,30978,31523,31902,32166,32349,32476,32565,32627,32768},
+  { 9728,16568,21377,24759,27137,28809,29984,30811,31392,31801,32088,32290,32432,32532,32602,32768},
+  { 9474,16208,20995,24399,26819,28539,29762,30631,31249,31688,32000,32222,32380,32492,32572,32768},
+  { 9216,15840,20601,24023,26483,28251,29522,30435,31091,31563,31902,32146,32321,32447,32537,32768},
+  { 8959,15469,20199,23636,26133,27947,29265,30223,30919,31425,31792,32059,32253,32394,32496,32768},
+  { 8705,15097,19791,23238,25770,27629,28994,29997,30733,31274,31671,31963,32177,32334,32449,32768},
+  { 8449,14719,19373,22827,25390,27292,28704,29752,30530,31107,31535,31853,32089,32264,32394,32768},
+  { 8192,14336,18944,22400,24992,26936,28394,29488,30308,30923,31384,31730,31989,32184,32330,32768},
+  { 7936,13950,18507,21961,24578,26561,28064,29203,30066,30720,31216,31592,31877,32093,32256,32768},
+  { 7678,13558,18060,21507,24146,26166,27713,28897,29804,30498,31030,31437,31749,31988,32171,32768},
+  { 7423,13165,17606,21041,23698,25753,27342,28571,29522,30257,30826,31266,31606,31869,32073,32768},
+  { 7168,12768,17143,20561,23231,25317,26947,28220,29215,29992,30599,31073,31444,31734,31960,32768},
+  { 6911,12365,16669,20065,22744,24858,26526,27842,28881,29701,30348,30858,31261,31579,31830,32768},
+  { 6657,11961,16188,19556,22240,24379,26083,27441,28523,29385,30072,30620,31056,31404,31681,32768},
+  { 6400,11550,15694,19029,21712,23871,25609,27007,28132,29037,29766,30352,30824,31204,31509,32768},
+  { 6142,11134,15190,18486,21164,23340,25108,26544,27711,28659,29429,30055,30564,30977,31313,32768},
+  { 5890,10720,14682,17932,20598,22785,24579,26051,27258,28248,29060,29726,30273,30721,31089,32768},
+  { 5631,10295,14157,17356,20005,22199,24016,25520,26766,27798,28652,29359,29945,30430,30832,32768},
+  { 5377, 9871,13628,16768,19393,21587,23421,24954,26236,27308,28204,28953,29579,30102,30539,32768},
+  { 5121, 9441,13086,16161,18756,20945,22792,24351,25666,26776,27712,28502,29169,29731,30206,32768},
+  { 4865, 9007,12534,15538,18096,20274,22129,23708,25053,26198,27173,28004,28711,29313,29826,32768},
+  { 4608, 8568,11971,14896,17409,19569,21425,23020,24391,25569,26581,27451,28199,28842,29394,32768},
+  { 4351, 8125,11398,14236,16697,18831,20682,22287,23679,24886,25933,26841,27628,28311,28903,32768},
+  { 4096, 7680,10816,13560,15961,18062,19900,21508,22915,24146,25224,26167,26992,27714,28346,32768},
+  { 3840, 7230,10223,12865,15197,17256,19074,20679,22096,23347,24451,25426,26287,27047,27718,32768},
+  { 3584, 6776, 9619,12151,14406,16414,18203,19796,21215,22479,23604,24606,25499,26294,27002,32768},
+  { 3328, 6318, 9004,11417,13585,15533,17283,18856,20269,21538,22678,23703,24624,25451,26194,32768},
+  { 3072, 5856, 8379,10665,12737,14615,16317,17859,19257,20524,21672,22712,23655,24509,25283,32768},
+  { 2816, 5390, 7743, 9894,11860,13657,15299,16800,18172,19426,20573,21621,22579,23455,24255,32768},
+  { 2560, 4920, 7096, 9102,10951,12656,14227,15676,17011,18242,19377,20423,21388,22277,23097,32768},
+  { 2304, 4446, 6437, 8288,10009,11609,13097,14480,15766,16961,18072,19105,20066,20959,21789,32768},
+  { 2048, 3968, 5768, 7456, 9038,10521,11911,13215,14437,15583,16657,17664,18608,19493,20323,32768},
+  { 1792, 3486, 5087, 6601, 8032, 9385,10664,11873,13016,14096,15117,16082,16995,17858,18673,32768},
+  { 1536, 3000, 4395, 5725, 6993, 8201, 9353,10451,11497,12494,13444,14350,15213,16036,16820,32768},
+  { 1280, 2510, 3692, 4828, 5919, 6968, 7976, 8944, 9875,10769,11628,12454,13248,14011,14744,32768},
+  { 1024, 2016, 2977, 3908, 4810, 5684, 6530, 7350, 8144, 8913, 9658,10380,11080,11758,12415,32768},
+  {  768, 1518, 2250, 2965, 3663, 4345, 5011, 5662, 6297, 6917, 7523, 8115, 8693, 9257, 9808,32768},
+  {  512, 1016, 1512, 2000, 2481, 2954, 3420, 3879, 4330, 4774, 5211, 5642, 6066, 6483, 6894,32768},
+  {  256,  510,  762, 1012, 1260, 1506, 1750, 1992, 2232, 2471, 2708, 2943, 3176, 3407, 3636,32768},
+const uint16_t LAPLACE_OFFSET[128] = {
+  0,
+  29871,
+  28672,
+  27751,
+  26975,
+  26291,
+  25673,
+  25105,
+  24576,
+  24079,
+  23609,
+  23162,
+  22734,
+  22325,
+  21931,
+  21550,
+  21182,
+  20826,
+  20480,
+  20143,
+  19815,
+  19495,
+  19183,
+  18877,
+  18579,
+  18286,
+  17999,
+  17718,
+  17442,
+  17170,
+  16904,
+  16642,
+  16384,
+  16129,
+  15879,
+  15633,
+  15390,
+  15150,
+  14913,
+  14680,
+  14450,
+  14222,
+  13997,
+  13775,
+  13556,
+  13338,
+  13124,
+  12911,
+  12701,
+  12493,
+  12288,
+  12084,
+  11882,
+  11682,
+  11484,
+  11288,
+  11094,
+  10901,
+  10710,
+  10521,
+  10333,
+  10147,
+  9962,
+  9779,
+  9597,
+  9417,
+  9238,
+  9060,
+  8884,
+  8709,
+  8535,
+  8363,
+  8192,
+  8021,
+  7853,
+  7685,
+  7518,
+  7352,
+  7188,
+  7025,
+  6862,
+  6701,
+  6540,
+  6381,
+  6222,
+  6065,
+  5908,
+  5753,
+  5598,
+  5444,
+  5291,
+  5138,
+  4987,
+  4837,
+  4687,
+  4538,
+  4390,
+  4242,
+  4096,
+  3950,
+  3804,
+  3660,
+  3516,
+  3373,
+  3231,
+  3089,
+  2948,
+  2808,
+  2668,
+  2529,
+  2391,
+  2253,
+  2116,
+  1979,
+  1843,
+  1708,
+  1573,
+  1439,
+  1306,
+  1172,
+  1040,
+  908,
+  777,
+  646,
+  516,
+  386,
+  257,
+  128,
diff --git a/av1/common/odintrin.c b/av1/common/odintrin.c
index bb36104..868efac 100644
--- a/av1/common/odintrin.c
+++ b/av1/common/odintrin.c
@@ -1,5 +1,5 @@
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -8,8 +8,21 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at
+/* clang-format off */
 #include "av1/common/odintrin.h"
+# include <stdio.h>
+void od_fatal_impl(const char *_str, const char *_file, int _line) {
+  fprintf(stderr, "Fatal (internal) error in %s, line %d: %s\n",
+   _file, _line, _str);
+  abort();
 /*Constants for use with OD_DIVU_SMALL().
   See \cite{Rob05} for details on computing these constants.
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index f9049c7..64cadd6 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -8,9 +8,16 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at
+/* clang-format off */
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/bitops.h"
@@ -20,15 +27,48 @@
 extern "C" {
+# if !defined(M_LOG2E)
+#  define M_LOG2E (1.4426950408889634073599246810019)
+# endif
+# if !defined(M_LN2)
+#  define M_LN2 (0.69314718055994530941723212145818)
+# endif
 /*Smallest blocks are 4x4*/
 #define OD_LOG_BSIZE0 (2)
-/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
-#define OD_NBSIZES (5)
+/*There are 4 block sizes total (4x4, 8x8, 16x16 and 32x32).*/
+#define OD_NBSIZES (4)
 /*The log of the maximum length of the side of a block.*/
 /*The maximum length of the side of a block.*/
+/**The maximum number of color planes allowed in a single frame.*/
+# define OD_NPLANES_MAX (3)
+# define OD_COEFF_SHIFT (4)
+# define OD_DISABLE_CFL (1)
+# define OD_DISABLE_FILTER (1)
+# define OD_LOG(a)
+# define OD_LOG_PARTIAL(a)
+/*Possible block sizes, note that OD_BLOCK_NXN = log2(N) - 2.*/
+#define OD_BLOCK_4X4 (0)
+#define OD_BLOCK_8X8 (1)
+#define OD_BLOCK_16X16 (2)
+#define OD_BLOCK_32X32 (3)
+#define OD_BLOCK_SIZES (OD_BLOCK_32X32 + 1)
+# define OD_ROBUST_STREAM (1)
 typedef int od_coeff;
 #define OD_DIVU_DMAX (1024)
@@ -58,7 +98,8 @@
   We define a special version of the macro to use when x can be zero.*/
 #define OD_ILOG(x) ((x) ? OD_ILOG_NZ(x) : 0)
-#define OD_LOG2 AOMLOG2
+#define OD_LOG2(x) (M_LOG2E*log(x))
+#define OD_EXP2(x) (exp(M_LN2*(x)))
 /*Enable special features for gcc and compatible compilers.*/
 #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -121,10 +162,92 @@
 /** Copy n elements of memory from src to dst, allowing overlapping regions.
     The 0* term provides compile-time type checking */
 #if !defined(OVERRIDE_OD_MOVE)
-#define OD_MOVE(dst, src, n) \
-  (memmove((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src))))
+# define OD_MOVE(dst, src, n) \
+ (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
+/** Linkage will break without this if using a C++ compiler, and will issue
+ * warnings without this for a C compiler*/
+#if defined(__cplusplus)
+# define OD_EXTERN extern
+# define OD_EXTERN
+/** Set n elements of dst to zero */
+#if !defined(OVERRIDE_OD_CLEAR)
+# define OD_CLEAR(dst, n) (memset((dst), 0, sizeof(*(dst))*(n)))
+/** Silence unused parameter/variable warnings */
+# define OD_UNUSED(expr) (void)(expr)
+#if defined(OD_FLOAT_PVQ)
+typedef double od_val16;
+typedef double od_val32;
+# define OD_QCONST32(x, bits) (x)
+# define OD_ROUND16(x) (x)
+# define OD_ROUND32(x) (x)
+# define OD_SHL(x, shift) (x)
+# define OD_SHR(x, shift) (x)
+# define OD_SHR_ROUND(x, shift) (x)
+# define OD_ABS(x) (fabs(x))
+# define OD_MULT16_16(a, b) ((a)*(b))
+# define OD_MULT16_32_Q16(a, b) ((a)*(b))
+typedef int16_t od_val16;
+typedef int32_t od_val32;
+/** Compile-time conversion of float constant to 32-bit value */
+# define OD_QCONST32(x, bits) ((od_val32)(.5 + (x)*(((od_val32)1) << (bits))))
+# define OD_ROUND16(x) (int16_t)(floor(.5 + (x)))
+# define OD_ROUND32(x) (int32_t)(floor(.5 + (x)))
+/*Shift a left by shift*/
+# define OD_SHL(a, shift) ((int32_t)((uint32_t)(a) << (shift)))
+/*Shift x right by shift (without rounding)*/
+# define OD_SHR(x, shift) \
+  ((int32_t)((x) >> (shift)))
+/*Shift x right by shift (with rounding)*/
+# define OD_SHR_ROUND(x, shift) \
+  ((int32_t)(((x) + (1 << (shift) >> 1)) >> (shift)))
+/*Shift x right by shift (without rounding) or left by -shift if shift
+  is negative.*/
+# define OD_VSHR(x, shift) \
+  (((shift) > 0) ? OD_SHR(x, shift) : OD_SHL(x, -(shift)))
+/*Shift x right by shift (with rounding) or left by -shift if shift
+  is negative.*/
+# define OD_VSHR_ROUND(x, shift) \
+  (((shift) > 0) ? OD_SHR_ROUND(x, shift) : OD_SHL(x, -(shift)))
+# define OD_ABS(x) (abs(x))
+/* (od_val32)(od_val16) gives TI compiler a hint that it's 16x16->32 multiply */
+/** 16x16 multiplication where the result fits in 32 bits */
+# define OD_MULT16_16(a, b) \
+ (((od_val32)(od_val16)(a))*((od_val32)(od_val16)(b)))
+/* Multiplies 16-bit a by 32-bit b and keeps bits [16:47]. */
+# define OD_MULT16_32_Q16(a, b) ((int16_t)(a)*(int64_t)(int32_t)(b) >> 16)
+/*16x16 multiplication where the result fits in 16 bits, without rounding.*/
+# define OD_MULT16_16_Q15(a, b) \
+  (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15)
+/*16x16 multiplication where the result fits in 16 bits, without rounding.*/
+# define OD_MULT16_16_Q16(a, b) \
+  ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> 16)
+/*All of these macros should expect floats as arguments.*/
+/*These two should compile as a single SSE instruction.*/
+# define OD_MINF(a, b) ((a) < (b) ? (a) : (b))
+# define OD_MAXF(a, b) ((a) > (b) ? (a) : (b))
+# define OD_DIV_R0(x, y) (((x) + OD_FLIPSIGNI((((y) + 1) >> 1) - 1, (x)))/(y))
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+# define OD_MULT16_16_Q15(a, b) \
+  (((int16_t)(a)*((int32_t)(int16_t)(b))) >> 15)
+/* Multiplies 16-bit a by 32-bit b and keeps bits [16:47]. */
+# define OD_MULT16_32_Q16(a, b) ((int16_t)(a)*(int64_t)(int32_t)(b) >> 16)
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 0d42119..b4e5166 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -23,6 +23,10 @@
 #include "av1/common/frame_buffers.h"
 #include "av1/common/loopfilter.h"
 #include "av1/common/tile_common.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/pvq.h"
 #ifdef __cplusplus
 extern "C" {
@@ -437,11 +441,17 @@
 static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *pvq_ref_coeff,
                                         tran_low_t *dqcoeff) {
   int i;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
+    xd->plane[i].pvq_ref_coeff = pvq_ref_coeff;
     xd->above_context[i] =
         cm->above_context +
         i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
diff --git a/av1/common/partition.c b/av1/common/partition.c
new file mode 100644
index 0000000..63d9d69
--- /dev/null
+++ b/av1/common/partition.c
@@ -0,0 +1,256 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include "enums.h"
+#include "odintrin.h"
+#include "partition.h"
+#include "zigzag.h"
+OD_EXTERN const index_pair *OD_ZIGZAG4[4] = {
+OD_EXTERN const index_pair *OD_ZIGZAG8[4] = {
+OD_EXTERN const index_pair *OD_ZIGZAG16[4] = {
+OD_EXTERN const index_pair *OD_ZIGZAG32[4] = {
+/* The tables below specify how coefficient blocks are translated to
+   and from PVQ partition coding scan order for 4x4, 8x8, 16x16 and 32x32 */
+static const int OD_LAYOUT32_OFFSETS[4] = { 0, 128, 256, 768 };
+const band_layout OD_LAYOUT32 = {
+  32,
+  3,
+static const int OD_LAYOUT16_OFFSETS[4] = { 0, 32, 64, 192 };
+const band_layout OD_LAYOUT16 = {
+  16,
+  3,
+const int OD_LAYOUT8_OFFSETS[4] = { 0, 8, 16, 48 };
+const band_layout OD_LAYOUT8 = {
+  8,
+  3,
+static const int OD_LAYOUT4_OFFSETS[2] = { 0, 15 };
+const band_layout OD_LAYOUT4 = {
+  4,
+  1,
+/* First element is the number of bands, followed by the list of all the band
+  boundaries. */
+static const int OD_BAND_OFFSETS4[] = {1, 1, 16};
+static const int OD_BAND_OFFSETS8[] = {4, 1, 16, 24, 32, 64};
+static const int OD_BAND_OFFSETS16[] = {7, 1, 16, 24, 32, 64, 96, 128, 256};
+static const int OD_BAND_OFFSETS32[] = {10, 1, 16, 24, 32, 64, 96, 128, 256,
+ 384, 512, 1024};
+static const int OD_BAND_OFFSETS64[] = {13, 1, 16, 24, 32, 64, 96, 128, 256,
+ 384, 512, 1024, 1536, 2048, 4096};
+const int *const OD_BAND_OFFSETS[OD_NBSIZES + 1] = {
+/** Perform a single stage of conversion from a coefficient block in
+ * raster order into coding scan order
+ *
+ * layout->band_offsets[layout->nb_bands] coefficients are copied. For each
+ * one, the index pair from layout->dst_table[tx_type] gives the source
+ * position: entry [0] is the column and entry [1] the row (times stride).
+ *
+ * @param [in]     layout  scan order specification
+ * @param [out]    dst     destination vector
+ * @param [in]     src     source coefficient block
+ * @param [in]     stride  source vector row stride
+ * @param [in]     tx_type selects which table of layout->dst_table is used
+ */
+static void od_band_from_raster(const band_layout *layout, int16_t *dst,
+ const int16_t *src, int stride, TX_TYPE tx_type) {
+  int i;
+  int len;
+  len = layout->band_offsets[layout->nb_bands];
+  for (i = 0; i < len; i++) {
+    dst[i] = src[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]];
+  }
+/** Perform a single stage of conversion from a vector in coding scan
+    order back into a coefficient block in raster order
+ *
+ * layout->band_offsets[layout->nb_bands] coefficients are copied. For each
+ * one, the index pair from layout->dst_table[tx_type] gives the destination
+ * position: entry [0] is the column and entry [1] the row (times stride).
+ *
+ * @param [in]     layout  scan order specification
+ * @param [out]    dst     destination coefficient block
+ * @param [in]     stride  destination vector row stride
+ * @param [in]     tx_type selects which table of layout->dst_table is used
+ * @param [in]     src     source vector
+ */
+static void od_raster_from_band(const band_layout *layout, int16_t *dst,
+ int stride, TX_TYPE tx_type, const int16_t *src) {
+  int i;
+  int len;
+  len = layout->band_offsets[layout->nb_bands];
+  for (i = 0; i < len; i++) {
+    dst[layout->dst_table[tx_type][i][1]*stride + layout->dst_table[tx_type][i][0]] = src[i];
+  }
+static const band_layout *const OD_LAYOUTS[] = {&OD_LAYOUT4, &OD_LAYOUT8,
+/** Converts a coefficient block in raster order into a vector in
+ * coding scan order with the PVQ partitions laid out one after
+ * another.  This works in stages; the 4x4 conversion is applied to
+ * the coefficients nearest DC, then the 8x8 applied to the 8x8 block
+ * nearest DC that was not already coded by 4x4, then 16x16 and any
+ * larger size following the same pattern.  A stage beyond 4x4 is only
+ * applied when n is at least that stage's block size, and DC is copied
+ * directly from src[0] to dst[0] at the end.
+ *
+ * @param [out]    dst        destination vector
+ * @param [in]     n          block size (along one side)
+ * @param [in]     ty_type    transform type
+ * @param [in]     src        source coefficient block
+ * @param [in]     stride     source vector row stride
+ */
+void od_raster_to_coding_order(int16_t *dst, int n, TX_TYPE ty_type,
+ const int16_t *src, int stride) {
+  int bs;
+  /* dst + 1 because DC is not included for 4x4 blocks. */
+  od_band_from_raster(OD_LAYOUTS[0], dst + 1, src, stride, ty_type);
+  for (bs = 1; bs < OD_NBSIZES; bs++) {
+    int size;
+    int offset;
+    /* Length of block size > 4. */
+    size = 1 << (OD_LOG_BSIZE0 + bs);
+    /* Offset is the size of the previous block squared. */
+    offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
+    if (n >= size) {
+      /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
+      od_band_from_raster(OD_LAYOUTS[bs], dst + offset, src, stride, ty_type);
+    }
+  }
+  dst[0] = src[0];
+/** Converts a vector in coding scan order with the PVQ partitions
+ * laid out one after another into a coefficient block in raster
+ * order. This is the inverse of the raster->scan conversion: each
+ * stage writes the coefficients of its block size that are not part
+ * of the next smaller block (4x4 sans DC, 8x8 sans the 4x4 block it
+ * contains, 16x16 sans the 8x8 block, and so on).  A stage beyond 4x4
+ * is only applied when n is at least that stage's block size, and DC
+ * is copied directly from src[0] to dst[0] at the end.
+ *
+ * @param [out]    dst        destination coefficient block
+ * @param [in]     stride     destination vector row stride
+ * @param [in]     ty_type    transform type
+ * @param [in]     src        source vector
+ * @param [in]     n          block size (along one side)
+ */
+void od_coding_order_to_raster(int16_t *dst, int stride, TX_TYPE ty_type,
+ const int16_t *src, int n) {
+  int bs;
+  /* src + 1 because DC is not included for 4x4 blocks. */
+  od_raster_from_band(OD_LAYOUTS[0], dst, stride, ty_type, src + 1);
+  for (bs = 1; bs < OD_NBSIZES; bs++) {
+    int size;
+    int offset;
+    /* Length of block size > 4 */
+    size = 1 << (OD_LOG_BSIZE0 + bs);
+    /* Offset is the size of the previous block squared. */
+    offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
+    if (n >= size) {
+      /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
+      od_raster_from_band(OD_LAYOUTS[bs], dst, stride, ty_type, src + offset);
+    }
+  }
+  dst[0] = src[0];
+/** Perform a single stage of conversion from a coefficient block in
+ * raster order into coding scan order
+ *
+ * Same copy as od_band_from_raster(), except that the DCT_DCT table of
+ * layout->dst_table is always used (there is no transform-type argument).
+ *
+ * @param [in]     layout  scan order specification
+ * @param [out]    dst     destination vector
+ * @param [in]     src     source coefficient block
+ * @param [in]     stride  source vector row stride
+ */
+static void od_band_from_raster_16(const band_layout *layout, int16_t *dst,
+ const int16_t *src, int stride) {
+  int i;
+  int len;
+  len = layout->band_offsets[layout->nb_bands];
+  for (i = 0; i < len; i++) {
+    dst[i] = src[layout->dst_table[DCT_DCT][i][1]*stride + layout->dst_table[DCT_DCT][i][0]];
+  }
+/** Converts a coefficient block in raster order into a vector in
+ * coding scan order with the PVQ partitions laid out one after
+ * another.  This works in stages; the 4x4 conversion is applied to
+ * the coefficients nearest DC, then the 8x8 applied to the 8x8 block
+ * nearest DC that was not already coded by 4x4, then 16x16 and any
+ * larger size following the same pattern.  Every stage goes through
+ * od_band_from_raster_16(), i.e. the DCT_DCT scan tables; a stage
+ * beyond 4x4 is only applied when n is at least that stage's block
+ * size, and DC is copied directly from src[0] to dst[0] at the end.
+ *
+ * @param [out]    dst        destination vector
+ * @param [in]     n          block size (along one side)
+ * @param [in]     src        source coefficient block
+ * @param [in]     stride     source vector row stride
+ */
+void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src,
+ int stride) {
+  int bs;
+  /* dst + 1 because DC is not included for 4x4 blocks. */
+  od_band_from_raster_16(OD_LAYOUTS[0], dst + 1, src, stride);
+  for (bs = 1; bs < OD_NBSIZES; bs++) {
+    int size;
+    int offset;
+    /* Length of block size > 4. */
+    size = 1 << (OD_LOG_BSIZE0 + bs);
+    /* Offset is the size of the previous block squared. */
+    offset = 1 << 2*(OD_LOG_BSIZE0 - 1 + bs);
+    if (n >= size) {
+      /* 3 16x16 bands come after 3 8x8 bands, which come after 2 4x4 bands. */
+      od_band_from_raster_16(OD_LAYOUTS[bs], dst + offset, src, stride);
+    }
+  }
+  dst[0] = src[0];
diff --git a/av1/common/partition.h b/av1/common/partition.h
new file mode 100644
index 0000000..c86cb81
--- /dev/null
+++ b/av1/common/partition.h
@@ -0,0 +1,40 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+#if !defined(_partition_H)
+# define _partition_H
+#include "av1/common/enums.h"
+#include "odintrin.h"
+typedef unsigned char index_pair[2];
+typedef struct {
+  const index_pair **const dst_table; /* indexed by transform type, then by
+   coding-order position; each entry is an index_pair whose element [0] is
+   used as a column and element [1] as a row (scaled by the stride) */
+  int size; /* NOTE(review): not read by the code visible here; presumably
+   the block side length - confirm */
+  int nb_bands; /* number of bands; also indexes the final band_offsets
+   entry */
+  const int *const band_offsets; /* band_offsets[nb_bands] is used as the
+   total number of entries in the layout */
+} band_layout;
+extern const int *const OD_BAND_OFFSETS[OD_NBSIZES + 1];
+void od_raster_to_coding_order(int16_t *dst, int n,  TX_TYPE ty_type,
+ const int16_t *src, int stride);
+void od_coding_order_to_raster(int16_t *dst, int stride,  TX_TYPE ty_type,
+ const int16_t *src, int n);
+void od_raster_to_coding_order_16(int16_t *dst, int n, const int16_t *src,
+ int stride);
diff --git a/av1/common/pvq.c b/av1/common/pvq.c
new file mode 100644
index 0000000..62f3632
--- /dev/null
+++ b/av1/common/pvq.c
@@ -0,0 +1,954 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+# include "config.h"
+#include "odintrin.h"
+#include "partition.h"
+#include "pvq.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Quantization matrices for 8x8. For other block sizes, we currently just do
+   resampling. */
+/* Flat quantization, i.e. optimize for PSNR. */
+const int OD_QM8_Q4_FLAT[] = {
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16
+# if 0
+/* M1: MPEG2 matrix for inter (which has a dead zone). */
+const int OD_QM8_Q4[] = {
+  16, 17, 18, 19, 20, 21, 22, 23,
+  17, 18, 19, 20, 21, 22, 23, 24,
+  18, 19, 20, 21, 22, 23, 24, 25,
+  19, 20, 21, 22, 23, 24, 26, 27,
+  20, 21, 22, 23, 25, 26, 27, 28,
+  21, 22, 23, 24, 26, 27, 28, 30,
+  22, 23, 24, 26, 27, 28, 30, 31,
+  23, 24, 25, 27, 28, 30, 31, 33};
+# endif
+# if 0
+/* M2: MPEG2 matrix for intra (no dead zone). */
+const int OD_QM8_Q4[] = {
+  16, 16, 19, 22, 22, 26, 26, 27,
+  16, 16, 22, 22, 26, 27, 27, 29,
+  19, 22, 26, 26, 27, 29, 29, 35,
+  22, 24, 27, 27, 29, 32, 34, 38,
+  26, 27, 29, 29, 32, 35, 38, 46,
+  27, 29, 34, 34, 35, 40, 46, 56,
+  29, 34, 34, 37, 40, 48, 56, 69,
+  34, 37, 38, 40, 48, 58, 69, 83
+# endif
+# if 0
+/* M3: Taken from dump_psnrhvs. */
+const int OD_QM8_Q4[] = {
+  16, 16, 17, 20, 24, 29, 36, 42,
+  16, 17, 17, 19, 22, 26, 31, 37,
+  17, 17, 21, 23, 26, 30, 34, 40,
+  20, 19, 23, 28, 31, 35, 39, 45,
+  24, 22, 26, 31, 36, 41, 46, 51,
+  29, 26, 30, 35, 41, 47, 52, 58,
+  36, 31, 34, 39, 46, 52, 59, 66,
+  42, 37, 40, 45, 51, 58, 66, 73
+# endif
+# if 1
+/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */
+const int OD_QM8_Q4_HVS[] = {
+  16, 16, 18, 21, 24, 28, 32, 36,
+  16, 17, 20, 21, 24, 27, 31, 35,
+  18, 20, 24, 25, 27, 31, 33, 38,
+  21, 21, 25, 28, 30, 34, 37, 42,
+  24, 24, 27, 30, 34, 38, 43, 49,
+  28, 27, 31, 34, 38, 44, 50, 58,
+  32, 31, 33, 37, 43, 50, 58, 68,
+  36, 35, 38, 42, 49, 58, 68, 78
+/* Constants for the beta parameter, which controls how activity masking is
+   used.
+   beta = 1 / (1 - alpha), so when beta is 1, alpha is 0 and activity
+   masking is disabled. When beta is 1.5, activity masking is used. Note that
+   activity masking is neither used for 4x4 blocks nor for chroma. */
+static const od_val16 OD_PVQ_BETA4_LUMA[1] = {OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA8_LUMA[4] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA16_LUMA[7] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA32_LUMA[10] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA64_LUMA[13] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA4_LUMA_MASKING[1] = {OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA8_LUMA_MASKING[4] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)};
+static const od_val16 OD_PVQ_BETA16_LUMA_MASKING[7] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
+ OD_BETA(1.5)};
+static const od_val16 OD_PVQ_BETA32_LUMA_MASKING[10] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5)};
+static const od_val16 OD_PVQ_BETA64_LUMA_MASKING[13] = {OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5), OD_BETA(1.5),
+ OD_BETA(1.5), OD_BETA(1.5)};
+static const od_val16 OD_PVQ_BETA4_CHROMA[1] = {OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA8_CHROMA[4] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA16_CHROMA[7] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA32_CHROMA[10] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.)};
+static const od_val16 OD_PVQ_BETA64_CHROMA[13] = {OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.),
+ OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.), OD_BETA(1.)};
+const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_NBSIZES + 1] = {
+void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe) { /* Resets
+   the PVQ adaptation fields of *state to fixed starting values; is_keyframe
+   only influences the initial value written to pvq_ext. */
+  od_pvq_codeword_ctx *ctx;
+  int i;
+  int pli;
+  int bs;
+  ctx = &state->pvq_codeword_ctx;
+  generic_model_init(&state->pvq_param_model[0]);
+  generic_model_init(&state->pvq_param_model[1]);
+  generic_model_init(&state->pvq_param_model[2]);
+  for (i = 0; i < 2*OD_NBSIZES; i++) { /* pvq_adapt holds groups of four
+   values, one group per i */
+    ctx->pvq_adapt[4*i + OD_ADAPT_K_Q8] = 384;
+    ctx->pvq_adapt[4*i + OD_ADAPT_SUM_EX_Q8] = 256;
+    ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_Q8] = 104;
+    ctx->pvq_adapt[4*i + OD_ADAPT_COUNT_EX_Q8] = 128;
+  }
+  ctx->pvq_k1_increment = 128;
+  OD_CDFS_INIT(ctx->pvq_k1_cdf, ctx->pvq_k1_increment);
+  for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
+    for (bs = 0; bs < OD_NBSIZES; bs++) /* unbraced: the loop below is its
+     whole body */
+    for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
+      state->pvq_exg[pli][bs][i] = 2 << 16;
+    }
+  }
+  for (i = 0; i < OD_NBSIZES*PVQ_MAX_PARTITIONS; i++) {
+    state->pvq_ext[i] = is_keyframe ? 24576 : 2 << 16; /* 2 << 16 is the
+     whole else operand (shift binds tighter than ?:) */
+  }
+  state->pvq_gaintheta_increment = 128;
+  OD_CDFS_INIT(state->pvq_gaintheta_cdf, state->pvq_gaintheta_increment >> 2);
+  state->pvq_skip_dir_increment = 128;
+  OD_CDFS_INIT(state->pvq_skip_dir_cdf, state->pvq_skip_dir_increment >> 2);
+  ctx->pvq_split_increment = 128;
+  OD_CDFS_INIT(ctx->pvq_split_cdf, ctx->pvq_split_increment >> 1);
+/* QMs are arranged from smallest to largest blocksizes, first for
+   blocks with decimation=0, followed by blocks with decimation=1.
+   Returns the offset of the matrix for block size index bs and decimation
+   flag xydec: xydec selects a group of OD_QM_STRIDE entries and
+   OD_QM_OFFSET(bs) locates the block size inside that group. */
+int od_qm_offset(int bs, int xydec)
+    return xydec*OD_QM_STRIDE + OD_QM_OFFSET(bs);
+/* Initialize the quantization matrix.
+   Fills x with the forward scaling factors and x_inv with their inverses,
+   both converted to coding order, for every block size index and for both
+   xydec values. qm is read as an 8x8 raster matrix and resampled to each
+   block size; each entry is multiplied by 0.0625 and inverted, except at
+   the DC position, which always keeps a magnitude of 1.0. */
+// Note: When varying scan orders for hybrid transform is used by PVQ,
+// since AOM does not use magnitude compensation (i.e. simply x16 for all coeffs),
+// we don't need separate qm and qm_inv for each transform type.
+void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm) {
+  int i;
+  int j;
+  int16_t y[OD_BSIZE_MAX*OD_BSIZE_MAX];
+  int16_t y_inv[OD_BSIZE_MAX*OD_BSIZE_MAX];
+  int16_t *x1;
+  int16_t *x1_inv;
+  int off;
+  int bs;
+  int xydec;
+  for (bs = 0; bs < OD_NBSIZES; bs++) {
+    for (xydec = 0; xydec < 2; xydec++) {
+      off = od_qm_offset(bs, xydec); /* xydec only changes where the result
+       is stored; the values computed below do not depend on it */
+      x1 = x + off;
+      x1_inv = x_inv + off;
+      for (i = 0; i < 4 << bs; i++) {
+        for (j = 0; j < 4 << bs; j++) {
+          double mag;
+          mag = 1.0;
+          if (i == 0 && j == 0) {
+            mag = 1.0;
+          }
+          else {
+            mag /= 0.0625*qm[(i << 1 >> bs)*8 + (j << 1 >> bs)]; /* the
+             (i << 1 >> bs) and (j << 1 >> bs) terms map the (4 << bs)-wide
+             block onto the 8 rows and columns of qm */
+            OD_ASSERT(mag > 0.0);
+          }
+          /*Convert to fit in 16 bits.*/
+          y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX,
+           (int32_t)floor(.5 + mag*OD_QM_SCALE));
+          y_inv[i*(4 << bs) + j] = (int16_t)floor(.5
+           + OD_QM_SCALE*OD_QM_INV_SCALE/(double)y[i*(4 << bs) + j]);
+        }
+      }
+      od_raster_to_coding_order_16(x1, 4 << bs, y, 4 << bs);
+      od_raster_to_coding_order_16(x1_inv, 4 << bs, y_inv, 4 << bs);
+    }
+  }
+/* Maps each possible size (n) in the split k-tokenizer to a different value.
+   Possible values of n are:
+   2, 3, 4, 7, 8, 14, 15, 16, 31, 32, 63, 64, 127, 128
+   Since we don't care about the order (even in the bit-stream) the simplest
+   ordering (implemented here) is:
+   14, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128
+   The -7*(n == 14) term is what moves n = 14 to the front of that list.
+   NOTE(review): the listed order corresponds to return values 0..13 if
+   OD_ILOG(v) is the bit length of v - confirm against its definition. */
+int od_pvq_size_ctx(int n) {
+  int logn;
+  int odd;
+  logn = OD_ILOG(n - 1);
+  odd = n & 1;
+  return 2*logn - 1 - odd - 7*(n == 14);
+/* Maps a length n to a context for the (k=1, n<=16) coder, with a special
+   case when n is the original length (orig_length=1) of the vector (i.e. we
+   haven't split it yet). For orig_length=0, we use the same mapping as
+   od_pvq_size_ctx() up to n=16. When orig_length=1, we map lengths
+   7, 8, 14, 15 to contexts 8 to 11.
+   The orig_length=1 formula is 8 + 2*(n > 8) + (n & 1), which gives
+   8 -> 8, 7 -> 9, 14 -> 10 and 15 -> 11. */
+int od_pvq_k1_ctx(int n, int orig_length) {
+  if (orig_length) return 8 + 2*(n > 8) + (n & 1);
+  else return od_pvq_size_ctx(n);
+/* Indexing for the packed quantization matrices.
+   bs is the block size index (asserted to be in [0, OD_NBSIZES)) and band
+   is the band number within that block size. Each block size starts at
+   bs*(bs + 1). */
+int od_qm_get_index(int bs, int band) {
+  /* The -band/3 term is due to the fact that we force corresponding horizontal
+     and vertical bands to have the same quantization. */
+  OD_ASSERT(bs >= 0 && bs < OD_NBSIZES);
+  return bs*(bs + 1) + band - band/3;
+#if !defined(OD_FLOAT_PVQ)
+/*See celt/mathops.c in Opus and tools/cos_search.c.
+  Helper for od_pvq_cos(): evaluates a polynomial in x*x and limits the
+   result to at most 32767.
+  od_pvq_cos() only calls it with arguments strictly between 0 and 1 << 15.*/
+static int16_t od_pvq_cos_pi_2(int16_t x)
+  int16_t x2;
+  x2 = OD_MULT16_16_Q15(x, x);
+  return OD_MINI(32767, (1073758164 - x*x + x2*(-7654 + OD_MULT16_16_Q16(x2,
+   16573 + OD_MULT16_16_Q16(-2529, x2)))) >> 15);
+/*Approximates cos(x) for -pi < x < pi.
+  Input is in OD_THETA_SCALE.
+  In the integer path the argument is reduced modulo 1 << 17 and reflected
+   into [0, 1 << 16]; exact multiples of 1 << 15 return 32767, 0 or -32767
+   directly and every other value goes through od_pvq_cos_pi_2().
+  NOTE(review): no #else/#endif is visible between the floating-point return
+   and the integer path in this chunk - confirm against the full file.*/
+od_val16 od_pvq_cos(od_val32 x) {
+#if defined(OD_FLOAT_PVQ)
+  return cos(x);
+  /*Wrap x around by masking, since cos is periodic.*/
+  x = x & 0x0001ffff;
+  if (x > (1 << 16)) {
+    x = (1 << 17) - x; /* reflect the upper half of the period onto the lower half */
+  }
+  if (x & 0x00007fff) { /* x is not a multiple of 1 << 15 */
+    if (x < (1 << 15)) {
+       return od_pvq_cos_pi_2((int16_t)x);
+    }
+    else {
+      return -od_pvq_cos_pi_2((int16_t)(65536 - x)); /* mirror about 1 << 15 and negate */
+    }
+  }
+  else {
+    if (x & 0x0000ffff) { /* x == 1 << 15 */
+      return 0;
+    }
+    else if (x & 0x0001ffff) { /* x == 1 << 16 */
+      return -32767;
+    }
+    else { /* x == 0 */
+      return 32767;
+    }
+  }
+/*Approximates sin(x) for 0 <= x < pi.
+  Input is in OD_THETA_SCALE.
+  The integer path evaluates od_pvq_cos(32768 - x), i.e. the cosine of the
+   argument mirrored about 1 << 15.
+  NOTE(review): no #else/#endif is visible between the two return statements
+   in this chunk - confirm against the full file.*/
+od_val16 od_pvq_sin(od_val32 x) {
+#if defined(OD_FLOAT_PVQ)
+  return sin(x);
+  return od_pvq_cos(32768 - x);
+#if !defined(OD_FLOAT_PVQ)
+/* Computes an upper-bound on the number of bits required to store the L2 norm
+   of a vector (excluding sign).
+   x is the input vector and n its number of elements. Each element is
+   shifted right by 8 before being squared and accumulated; the leading 8 in
+   the return value accounts for that shift. */
+int od_vector_log_mag(const od_coeff *x, int n) {
+  int i;
+  int32_t sum;
+  sum = 0;
+  for (i = 0; i < n; i++) {
+    int16_t tmp;
+    tmp = x[i] >> 8;
+    sum += tmp*(int32_t)tmp;
+  }
+  /* We add one full bit (instead of rounding OD_ILOG() up) for safety because
+     the >> 8 above causes the sum to be slightly underestimated. */
+  return 8 + 1 + OD_ILOG(n + sum)/2;
+/** Computes Householder reflection that aligns the reference r to the
+ *  dimension in r with the greatest absolute value. The reflection
+ *  vector is returned in r.
+ *
+ * @param [in,out]  r      reference vector to be reflected, reflection
+ *                         also returned in r
+ * @param [in]      n      number of dimensions in r
+ * @param [in]      gr     gain of reference vector
+ * @param [out]     sign   sign of reflection: 1 when the selected component
+ *                         is positive, -1 otherwise
+ * @param [in]      shift  shift handed to OD_SHR_ROUND() when gr*s is added
+ *                         to r[m]
+ * @return                 dimension number to which reflection aligns
+ **/
+int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
+ int shift) {
+  int m;
+  int i;
+  int s;
+  od_val16 maxr;
+  OD_UNUSED(shift); /* NOTE(review): shift is still read below; presumably
+   OD_SHR_ROUND() drops it in some builds - confirm */
+  /* Pick component with largest magnitude. Not strictly
+   * necessary, but it helps numerical stability */
+  m = 0;
+  maxr = 0;
+  for (i = 0; i < n; i++) {
+    if (OD_ABS(r[i]) > maxr) { /* strict comparison: the first maximum wins */
+      maxr = OD_ABS(r[i]);
+      m = i;
+    }
+  }
+  s = r[m] > 0 ? 1 : -1; /* a zero component yields -1 */
+  /* This turns r into a Householder reflection vector that would reflect
+   * the original r[] to e_m */
+  r[m] += OD_SHR_ROUND(gr*s, shift);
+  *sign = s;
+  return m;
+#if !defined(OD_FLOAT_PVQ)
+#define OD_RCP_INSHIFT 15
+#define OD_RCP_OUTSHIFT 14
+/*Fixed-point reciprocal approximation.
+  x is normalised to 1 + n with n in [0,1) in Q(OD_RCP_INSHIFT), 2/(n+1) is
+   estimated linearly and refined with two Newton iterations, and the result
+   is shifted back by the normalisation amount less OD_RCP_OUTSHIFT.
+  x must be positive: OD_ILOG(x) - 1 is used as the normalisation shift.*/
+static od_val16 od_rcp(od_val16 x)
+{
+  int i;
+  od_val16 n;
+  od_val16 r;
+  i = OD_ILOG(x) - 1;
+  /*n is Q15 with range [0,1).*/
+  n = OD_VSHR_ROUND(x, i - OD_RCP_INSHIFT) - (1 << OD_RCP_INSHIFT);
+  /*Start with a linear approximation:
+    r = 1.8823529411764706-0.9411764705882353*n.
+    The coefficients and the result are Q14 in the range [15420,30840].*/
+  r = 30840 + OD_MULT16_16_Q15(-15420, n);
+  /*Perform two Newton iterations:
+    r -= r*((r*n)-1.Q15)
+       = r*((r*n)+(r-1.Q15)).*/
+  r = r - OD_MULT16_16_Q15(r, (OD_MULT16_16_Q15(r, n) + r - 32768));
+  /*We subtract an extra 1 in the second iteration to avoid overflow; it also
+     neatly compensates for truncation error in the rest of the process.*/
+  r = r - (1 + OD_MULT16_16_Q15(r, OD_MULT16_16_Q15(r, n) + r - 32768));
+  /*r is now the Q15 solution to 2/(n+1), with a maximum relative error
+     of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute
+     error of 1.24665/32768.*/
+  return OD_VSHR_ROUND(r, i - OD_RCP_OUTSHIFT);
+}
+/** Applies Householder reflection from compute_householder(). The
+ * reflection is its own inverse.
+ *
+ * @param [out]     out    reflected vector
+ * @param [in]      x      vector to be reflected
+ * @param [in]      r      reflection
+ * @param [in]      n      number of dimensions in x,r
+ */
+void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r,
+ int n) {
+  int i;
+  od_val32 proj;
+  od_val16 proj_1;
+  od_val32 l2r;
+#if !defined(OD_FLOAT_PVQ)
+  od_val16 proj_norm;
+  od_val16 l2r_norm;
+  od_val16 rcp;
+  int proj_shift;
+  int l2r_shift;
+  int outshift;
+  /*FIXME: Can we get l2r and/or l2r_shift from an earlier computation?*/
+  l2r = 0;
+  for (i = 0; i < n; i++) {
+    l2r += OD_MULT16_16(r[i], r[i]);
+  }
+  /* Apply Householder reflection */
+  proj = 0;
+  for (i = 0; i < n; i++) {
+    proj += OD_MULT16_16(r[i], x[i]);
+  }
+#if defined(OD_FLOAT_PVQ)
+  proj_1 = proj*2./(1e-100 + l2r);
+  for (i = 0; i < n; i++) {
+    out[i] = x[i] - r[i]*proj_1;
+  }
+  /*l2r_norm is [0.5, 1.0[ in Q15.*/
+  l2r_shift = (OD_ILOG(l2r) - 1) - 14;
+  l2r_norm = OD_VSHR_ROUND(l2r, l2r_shift);
+  rcp = od_rcp(l2r_norm);
+  proj_shift = (OD_ILOG(abs(proj)) - 1) - 14;
+  /*proj_norm is [0.5, 1.0[ in Q15.*/
+  proj_norm = OD_VSHR_ROUND(proj, proj_shift);
+  proj_1 = OD_MULT16_16_Q15(proj_norm, rcp);
+  /*The proj*2. in the float code becomes -1 in the final outshift.
+    The sign of l2r_shift is positive since we're taking the reciprocal of
+     l2r_norm and this is a right shift.*/
+  outshift = OD_MINI(30, OD_RCP_OUTSHIFT - proj_shift - 1 + l2r_shift);
+  if (outshift >= 0) {
+    for (i = 0; i < n; i++) {
+      int32_t tmp;
+      tmp = OD_MULT16_16(r[i], proj_1);
+      tmp = OD_SHR_ROUND(tmp, outshift);
+      out[i] = x[i] - tmp;
+    }
+  }
+  else {
+    /*FIXME: Can we make this case impossible?
+      Right now, if r[] is all zeros except for 1, 2, or 3 ones, and
+       if x[] is all zeros except for large values at the same position as the
+       ones in r[], then we can end up with a shift of -1.*/
+    for (i = 0; i < n; i++) {
+      int32_t tmp;
+      tmp = OD_MULT16_16(r[i], proj_1);
+      tmp = OD_SHL(tmp, -outshift);
+      out[i] = x[i] - tmp;
+    }
+  }
+#if !defined(OD_FLOAT_PVQ)
+#define OD_EXP2_INSHIFT 15
+#define OD_EXP2_FRACSHIFT 15
+#define OD_EXP2_OUTSHIFT 15
+static const int32_t OD_EXP2_C[5] = {32768, 22709, 7913, 1704, 443};
+/*Output is [1.0, 2.0) in Q(OD_EXP2_FRACSHIFT).
+  It does not include the integer offset, which is added in od_exp2 after the
+   final shift).*/
+static int32_t od_exp2_frac(int32_t x)
+  return OD_MULT16_16_Q15(x, (OD_EXP2_C[1] + OD_MULT16_16_Q15(x,
+   (OD_EXP2_C[2] + OD_MULT16_16_Q15(x, (OD_EXP2_C[3]
+   + OD_MULT16_16_Q15(x, OD_EXP2_C[4])))))));
+/** Base-2 exponential approximation (2^x) with Q15 input and output.
+ * The integer part of x is split off with a right shift by OD_EXP2_INSHIFT.
+ * Integer parts above 14 return 0x7f000000 and integer parts below -15
+ * return 0. Otherwise OD_EXP2_C[0] plus the value returned by
+ * od_exp2_frac() for the remainder is shifted by the integer part.*/
+static int32_t od_exp2(int32_t x)
+  int integer;
+  int32_t frac;
+  integer = x >> OD_EXP2_INSHIFT;
+  if (integer > 14)
+    return 0x7f000000;
+  else if (integer < -15)
+    return 0;
+  frac = od_exp2_frac(x - OD_SHL(integer, OD_EXP2_INSHIFT));
+  return OD_VSHR_ROUND(OD_EXP2_C[0] + frac, -integer) + 1;
+#define OD_LOG2_INSHIFT 15
+#define OD_LOG2_OUTSHIFT 15
+#define OD_LOG2_INSCALE_1 (1./(1 << OD_LOG2_INSHIFT))
+static int16_t od_log2(int16_t x)
+  return x + OD_MULT16_16_Q15(x, (14482 + OD_MULT16_16_Q15(x, (-23234
+   + OD_MULT16_16_Q15(x, (13643 + OD_MULT16_16_Q15(x, (-6403
+   + OD_MULT16_16_Q15(x, 1515)))))))));
+static int32_t od_pow(int32_t x, od_val16 beta) /* Raises x to the power
+   beta by scaling a base-2 logarithm of x by beta and passing the result to
+   od_exp2(); returns 0 when x is 0. */
+  int16_t t;
+  int xshift;
+  int log2_x;
+  od_val32 logr;
+  /*FIXME: this conditional is to avoid doing log2(0).*/
+  if (x == 0)
+    return 0;
+  log2_x = (OD_ILOG(x) - 1);
+  xshift = log2_x - OD_LOG2_INSHIFT;
+  /*t should be in range [0.0, 1.0[ in Q(OD_LOG2_INSHIFT).*/
+  t = OD_VSHR(x, xshift) - (1 << OD_LOG2_INSHIFT);
+  /*log2(g/OD_COMPAND_SCALE) = log2(x) - OD_COMPAND_SHIFT in
+     Q(OD_LOG2_OUTSHIFT).*/
+  logr = od_log2(t) + (log2_x - OD_COMPAND_SHIFT)*OD_LOG2_OUTSCALE; /* NOTE(review):
+   OD_LOG2_OUTSCALE is not defined in the visible chunk - confirm */
+  logr = OD_MULT16_32_QBETA(beta, logr);
+  return od_exp2(logr);
+/** Gain companding: raises gain to the power 1/beta for activity masking.
+ *
+ * @param [in]  g     real (uncompanded) gain
+ * @param [in]  q0    uncompanded quality parameter
+ * @param [in]  beta  activity masking beta param (exponent)
+ * @return            g^(1/beta)
+ */
+static od_val32 od_gain_compand(od_val32 g, int q0, od_val16 beta) {
+#if defined(OD_FLOAT_PVQ)
+  if (beta == 1) return OD_ROUND32(OD_CGAIN_SCALE*g/(double)q0);
+  else {
+     1./beta)/(double)q0);
+  }
+  if (beta == OD_BETA(1)) return (OD_CGAIN_SCALE*g + (q0 >> 1))/q0;
+  else {
+    int32_t expr;
+    /*FIXME: This is 1/beta in Q(BETA_SHIFT), should use od_rcp() instead.*/
+    expr = od_pow(g, OD_ROUND16((1 << (2*OD_BETA_SHIFT))/(double)beta));
+    return (expr + (q0 >> 1))/q0;
+  }
+#if !defined(OD_FLOAT_PVQ)
+#define OD_SQRT_INSHIFT 16
+#define OD_SQRT_OUTSHIFT 15
+static int16_t od_rsqrt_norm(int16_t x);
+static int16_t od_sqrt_norm(int32_t x)
+  OD_ASSERT(x < 65536);
+  return OD_MINI(OD_SHR_ROUND(x*od_rsqrt_norm(x), OD_SQRT_OUTSHIFT), 32767);
+static int16_t od_sqrt(int32_t x, int *sqrt_shift)
+  int k;
+  int s;
+  int32_t t;
+  if (x == 0) {
+    *sqrt_shift = 0;
+     return 0;
+  }
+  OD_ASSERT(x < (1 << 30));
+  k = ((OD_ILOG(x) - 1) >> 1);
+  /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s).
+    Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/
+  s = 2*k - (OD_SQRT_INSHIFT - 2);
+  t = OD_VSHR(x, s);
+  /*We want to express od_sqrt() in terms of od_sqrt_norm(), which is
+     defined as (2^OUTSHIFT)*sqrt(t*(2^-INSHIFT)) with t=x*(2^-s).
+    This simplifies to 2^(OUTSHIFT-(INSHIFT/2)-(s/2))*sqrt(x), so the caller
+     needs to shift right by OUTSHIFT - INSHIFT/2 - s/2.*/
+  *sqrt_shift = OD_SQRT_OUTSHIFT - ((s + OD_SQRT_INSHIFT) >> 1);
+  return od_sqrt_norm(t);
+/** Gain expanding: raises gain to the power beta for activity masking.
+ *
+ * @param [in]  cg0   companded gain
+ * @param [in]  q0    uncompanded quality parameter
+ * @param [in]  beta  activity masking beta param (exponent)
+ * @return            g^beta
+ */
+od_val32 od_gain_expand(od_val32 cg0, int q0, od_val16 beta) {
+  if (beta == OD_BETA(1)) {
+    /*The multiply fits into 28 bits because the expanded gain has a range from
+       0 to 2^20.*/
+    return OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT);
+  }
+  else if (beta == OD_BETA(1.5)) {
+#if defined(OD_FLOAT_PVQ)
+    double cg;
+    cg = cg0*OD_CGAIN_SCALE_1;
+    cg *= q0*OD_COMPAND_SCALE_1;
+    return OD_ROUND32(OD_COMPAND_SCALE*cg*sqrt(cg));
+    int32_t irt;
+    int64_t tmp;
+    int sqrt_inshift;
+    int sqrt_outshift;
+    /*cg0 is in Q(OD_CGAIN_SHIFT) and we need to divide it by
+       2^OD_COMPAND_SHIFT.*/
+    irt = od_sqrt(cg0*q0, &sqrt_outshift);
+    sqrt_inshift = (OD_CGAIN_SHIFT + OD_COMPAND_SHIFT) >> 1;
+    /*tmp is in Q(OD_CGAIN_SHIFT + OD_COMPAND_SHIFT).*/
+    tmp = cg0*q0*(int64_t)irt;
+    /*Expanded gain must be in Q(OD_COMPAND_SHIFT), thus OD_COMPAND_SHIFT is
+       not included here.*/
+    return OD_VSHR_ROUND(tmp, OD_CGAIN_SHIFT + sqrt_outshift + sqrt_inshift);
+  }
+  else {
+#if defined(OD_FLOAT_PVQ)
+    /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the multiply by
+    double cg;
+    cg = cg0*OD_CGAIN_SCALE_1;
+    return OD_ROUND32(OD_COMPAND_SCALE*pow(cg*q0*OD_COMPAND_SCALE_1, beta));
+    int32_t expr;
+    int32_t cg;
+    cg = OD_SHR_ROUND(cg0*q0, OD_CGAIN_SHIFT);
+    expr = od_pow(cg, beta);
+    /*Expanded gain must be in Q(OD_COMPAND_SHIFT), hence the subtraction by
+  }
+/** Computes the raw and quantized/companded gain of a given input
+ * vector
+ *
+ * @param [in]      x      vector of input data
+ * @param [in]      n      number of elements in vector x
+ * @param [in]      q0     quantizer
+ * @param [out]     g      raw gain
+ * @param [in]      beta   activity masking beta param
+ * @param [in]      bshift shift to be applied to raw gain
+ * @return                 quantized/companded gain
+ */
+od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g,
+ od_val16 beta, int bshift) {
+  int i;
+  od_val32 acc;
+#if !defined(OD_FLOAT_PVQ)
+  od_val32 irt;
+  int sqrt_shift;
+  OD_UNUSED(bshift);
+  acc = 0;
+  for (i = 0; i < n; i++) {
+    acc += x[i]*(od_val32)x[i];
+  }
+#if defined(OD_FLOAT_PVQ)
+  *g = sqrt(acc);
+  irt = od_sqrt(acc, &sqrt_shift);
+  *g = OD_VSHR_ROUND(irt, sqrt_shift - bshift);
+  /* Normalize gain by quantization step size and apply companding
+     (if ACTIVITY != 1). */
+  return od_gain_compand(*g, q0, beta);
+static od_val16 od_beta_rcp(od_val16 beta){ /* Returns 1/beta in the OD_BETA()
+   representation: constants for beta equal to 1 and 1.5, od_rcp() for any
+   other value. */
+  if (beta == OD_BETA(1.))
+    return OD_BETA(1.);
+  else if (beta == OD_BETA(1.5))
+    return OD_BETA(1./1.5);
+  else {
+    od_val16 rcp_beta;
+    /*Shift by 1 less, transposing beta to range [.5, .75] and thus < 32768.*/
+    rcp_beta = od_rcp(beta << (OD_RCP_INSHIFT - 1 - OD_BETA_SHIFT));
+    return OD_SHR_ROUND(rcp_beta, OD_RCP_OUTSHIFT + 1 - OD_BETA_SHIFT); /* the
+     extra + 1 pairs with the one-less shift applied to the input above */
+  }
+/** Compute theta quantization range from quantized/companded gain
+ *
+ * @param [in]      qcg    quantized companded gain value
+ * @param [in]      beta   activity masking beta param
+ * @return                 max theta value
+ */
+int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta){
+  /* Set angular resolution (in ra) to match the encoded gain */
+#if defined(OD_FLOAT_PVQ)
+  int ts = (int)floor(.5 + qcg*OD_CGAIN_SCALE_1*M_PI/(2*beta));
+  int ts = OD_SHR_ROUND(qcg*OD_MULT16_16_QBETA(OD_QCONST32(M_PI/2,
+   OD_CGAIN_SHIFT), od_beta_rcp(beta)), OD_CGAIN_SHIFT*2);
+  /* Special case for low gains -- will need to be tuned anyway */
+  if (qcg < OD_QCONST32(1.4, OD_CGAIN_SHIFT)) ts = 1;
+  return ts;
+/** Decode quantized theta value from coded value
+ *
+ * @param [in]      t          coded (quantized) theta index; values above
+ *                             max_theta - 1 are clamped to max_theta - 1
+ * @param [in]      max_theta  maximum theta value
+ * @return                     decoded theta value, or 0 when max_theta is 0
+ */
+od_val32 od_pvq_compute_theta(int t, int max_theta) {
+  if (max_theta != 0) {
+#if defined(OD_FLOAT_PVQ)
+    return OD_MINI(t, max_theta - 1)*.5*M_PI/max_theta;
+    return (OD_MAX_THETA_SCALE*OD_MINI(t, max_theta - 1)
+     + (max_theta >> 1))/max_theta; /* adding max_theta >> 1 rounds the
+     integer division to nearest */
+  }
+  else return 0;
+#define OD_ITHETA_SHIFT 15
+/** Compute the number of pulses used for PVQ encoding a vector from
+ * available metrics (encode and decode side)
+ *
+ * @param [in]      qcg        quantized companded gain value
+ * @param [in]      itheta     quantized PVQ error angle theta
+ * @param [in]      theta      PVQ error angle theta
+ * @param [in]      noref      indicates present or lack of reference
+ *                             (prediction)
+ * @param [in]      n          number of elements to be coded
+ * @param [in]      beta       activity masking beta param
+ * @param [in]      nodesync   do not use info that depends on the reference
+ * @return                     number of pulses to use for coding
+ */
+int od_pvq_compute_k(od_val32 qcg, int itheta, od_val32 theta, int noref, int n,
+ od_val16 beta, int nodesync) {
+  if (noref) {
+    if (qcg == 0) return 0;
+    if (n == 15 && qcg == OD_CGAIN_SCALE && beta > OD_BETA(1.25)) {
+      return 1;
+    }
+    else {
+#if defined(OD_FLOAT_PVQ)
+      return OD_MAXI(1, (int)floor(.5 + (qcg*OD_CGAIN_SCALE_1 - .2)*
+       sqrt((n + 3)/2)/beta));
+      od_val32 rt;
+      int sqrt_shift;
+      rt = od_sqrt((n + 3) >> 1, &sqrt_shift);
+      /*FIXME: get rid of 64-bit mul.*/
+      return OD_MAXI(1, OD_SHR_ROUND((int64_t)((qcg
+       - (int64_t)OD_QCONST32(.2, OD_CGAIN_SHIFT))*rt/(beta*OD_BETA_SCALE_1)),
+       OD_CGAIN_SHIFT + sqrt_shift));
+    }
+  }
+  else {
+    if (itheta == 0) return 0;
+    /* Sets K according to gain and theta, based on the high-rate
+       PVQ distortion curves (see PVQ document). Low-rate will have to be
+       perceptually tuned anyway. We subtract 0.2 from the radius as an
+       approximation for the fact that the coefficients aren't identically
+       distributed within a band so at low gain the number of dimensions that
+       are likely to have a pulse is less than n. */
+    if (nodesync) {
+#if defined(OD_FLOAT_PVQ)
+      return OD_MAXI(1, (int)floor(.5 + (itheta - .2)*sqrt((n + 2)/2)));
+      od_val32 rt;
+      int sqrt_outshift;
+      rt = od_sqrt((n + 2)/2, &sqrt_outshift);
+      /*FIXME: get rid of 64-bit mul.*/
+      return OD_MAXI(1, OD_VSHR_ROUND(((OD_SHL(itheta, OD_ITHETA_SHIFT)
+       - OD_QCONST32(.2, OD_ITHETA_SHIFT)))*(int64_t)rt,
+       sqrt_outshift + OD_ITHETA_SHIFT));
+    }
+    else {
+      return OD_MAXI(1, (int)floor(.5 + (qcg*OD_CGAIN_SCALE_1*
+       od_pvq_sin(theta)*OD_TRIG_SCALE_1 - .2)*sqrt((n
+       + 2)/2)/(beta*OD_BETA_SCALE_1)));
+    }
+  }
+#if !defined(OD_FLOAT_PVQ)
+#define OD_RSQRT_INSHIFT 16
+/** Reciprocal sqrt approximation where the input is in the range [0.25,1) in
+     Q16 and the output is in the range (1.0, 2.0] in Q14.
+    Error is always within +/-1 of round(1/sqrt(t)).
+    NOTE(review): t is declared int16_t although the documented Q16 range
+     extends to 65535, and od_sqrt_norm() passes an int32_t below 65536;
+     inputs of 32768 and above appear to depend on integer conversion
+     wraparound - confirm.*/
+static int16_t od_rsqrt_norm(int16_t t)
+  int16_t n;
+  int32_t r;
+  int32_t r2;
+  int32_t ry;
+  int32_t y;
+  int32_t ret;
+  /* Range of n is [-16384,32767] ([-0.5,1) in Q15).*/
+  n = t - 32768;
+  OD_ASSERT(n >= -16384);
+  /*Get a rough initial guess for the root.
+    The optimal minimax quadratic approximation (using relative error) is
+     r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485).
+    Coefficients here, and the final result r, are Q14.*/
+  r = (23565 + OD_MULT16_16_Q15(n, (-13481 + OD_MULT16_16_Q15(n, 6711))));
+  /*We want y = t*r*r-1 in Q15, but t is 32-bit Q16 and r is Q14.
+    We can compute the result from n and r using Q15 multiplies with some
+     adjustment, carefully done to avoid overflow.*/
+  r2 = r*r;
+  y = (((r2 >> 15)*n + r2) >> 12) - 131077;
+  ry = r*y;
+  /*Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5).
+    This yields the Q14 reciprocal square root of the Q16 t, with a maximum
+     relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a peak
+     absolute error of 2.26591/16384.*/
+  ret = r + ((((ry >> 16)*(3*y) >> 3) - ry) >> 18);
+  OD_ASSERT(ret >= 16384 && ret < 32768);
+  return (int16_t)ret;
+static int16_t od_rsqrt(int32_t x, int *rsqrt_shift) /* Reciprocal square
+   root of x: returns the value from od_rsqrt_norm() for the normalised
+   input and stores in *rsqrt_shift the right shift the caller must apply. */
+   int k;
+   int s;
+   int16_t t;
+   k = (OD_ILOG(x) - 1) >> 1;
+  /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s).
+    Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/
+   s = 2*k - (OD_RSQRT_INSHIFT - 2);
+   t = OD_VSHR(x, s);
+   /*We want to express od_rsqrt() in terms of od_rsqrt_norm(), which is
+      defined as (2^OUTSHIFT)/sqrt(t*(2^-INSHIFT)) with t=x*(2^-s).
+     This simplifies to 2^(OUTSHIFT+(INSHIFT/2)+(s/2))/sqrt(x), so the caller
+      needs to shift right by OUTSHIFT + INSHIFT/2 + s/2.*/
+   *rsqrt_shift = OD_RSQRT_OUTSHIFT + ((s + OD_RSQRT_INSHIFT) >> 1); /* NOTE(review):
+    OD_RSQRT_OUTSHIFT is not defined in the visible chunk - confirm */
+   return od_rsqrt_norm(t);
+/** Synthesizes one partition of coefficient values from a PVQ-encoded
+ * vector.  This 'partial' version is called by the encode loop where
+ * the Householder reflection has already been computed and there's no
+ * need to recompute it.
+ *
+ * @param [out]     xcoeff  output coefficient partition (x in math doc)
+ * @param [in]      ypulse  PVQ-encoded values (y in the math doc); in
+ *                          the noref case, this vector has n entries,
+ *                          in the reference case it contains n-1 entries
+ *                          (the m-th entry is not included)
+ * @param [in]      r16     reference vector (prediction)
+ * @param [in]      n       number of elements in this partition
+ * @param [in]      noref   indicates presence or lack of prediction
+ * @param [in]      g       decoded quantized vector gain
+ * @param [in]      theta   decoded theta (prediction error)
+ * @param [in]      m       alignment dimension of Householder reflection
+ * @param [in]      s       sign of Householder reflection
+ * @param [in]      qm_inv  inverse of the QM with magnitude compensation
+ */
+void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
+ const od_val16 *r16, int n, int noref, od_val32 g, od_val32 theta, int m, int s,
+ const int16_t *qm_inv) {
+  int i;
+  int yy;
+  od_val32 scale;
+  int nn;
+  int gshift;
+  int qshift;
+  OD_ASSERT(g != 0);
+  nn = n-(!noref); /* when noref==0, vector in is sized n-1 */
+  yy = 0;
+  for (i = 0; i < nn; i++)
+    yy += ypulse[i]*(int32_t)ypulse[i];
+  /* Shift required for the magnitude of the pre-qm synthesis to be guaranteed
+     to fit in 16 bits. In practice, the range will be 8192-16384 after scaling
+     most of the time. */
+  gshift = OD_MAXI(0, OD_ILOG(g) - 14);
+  /*scale is g/sqrt(yy) in Q(16-gshift) so that x[]*scale has a norm that fits
+     in 16 bits.*/
+  if (yy == 0) scale = 0;
+#if defined(OD_FLOAT_PVQ)
+  else {
+    scale = g/sqrt(yy);
+  }
+  OD_UNUSED(gshift);
+  OD_UNUSED(qshift);
+  else {
+    int rsqrt_shift;
+    int16_t rsqrt;
+    /*FIXME: should be < int64_t*/
+    int64_t tmp;
+    rsqrt = od_rsqrt(yy, &rsqrt_shift);
+    tmp = rsqrt*(int64_t)g;
+    scale = OD_VSHR_ROUND(tmp, rsqrt_shift + gshift - 16);
+  }
+  /* Shift to apply after multiplying by the inverse QM, taking into account
+     gshift. */
+  qshift = OD_QM_INV_SHIFT - gshift;
+  if (noref) {
+    for (i = 0; i < n; i++) {
+      od_val32 x;
+      /* This multiply doesn't round, so it introduces some bias.
+         It would be nice (but not critical) to fix this. */
+      x = OD_MULT16_32_Q16(ypulse[i], scale);
+#if defined(OD_FLOAT_PVQ)
+      xcoeff[i] = (od_coeff)floor(.5
+       + x*(qm_inv[i]*OD_QM_INV_SCALE_1));
+      xcoeff[i] = OD_SHR_ROUND(x*qm_inv[i], qshift);
+    }
+  }
+  else{
+    od_val16 x[MAXN];
+    scale = OD_ROUND32(scale*OD_TRIG_SCALE_1*od_pvq_sin(theta));
+    /* The following multiply doesn't round, but it's probably OK since
+       the Householder reflection is likely to undo most of the resulting
+       bias. */
+    for (i = 0; i < m; i++)
+      x[i] = OD_MULT16_32_Q16(ypulse[i], scale);
+    x[m] = OD_ROUND16(-s*(OD_SHR_ROUND(g, gshift))*OD_TRIG_SCALE_1*
+     od_pvq_cos(theta));
+    for (i = m; i < nn; i++)
+      x[i+1] = OD_MULT16_32_Q16(ypulse[i], scale);
+    od_apply_householder(x, x, r16, n);
+    for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+      xcoeff[i] = (od_coeff)floor(.5 + (x[i]*(qm_inv[i]*OD_QM_INV_SCALE_1)));
+      xcoeff[i] = OD_SHR_ROUND(x[i]*qm_inv[i], qshift);
+    }
+  }
diff --git a/av1/common/pvq.h b/av1/common/pvq.h
new file mode 100644
index 0000000..a5051b4
--- /dev/null
+++ b/av1/common/pvq.h
@@ -0,0 +1,168 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_pvq_H)
+# define _pvq_H (1)
+# include "generic_code.h"
+# include "odintrin.h"
+extern const double *OD_BASIS_MAG[2][OD_NBSIZES + 1];
+extern const int OD_QM8_Q4_FLAT[];
+extern const int OD_QM8_Q4_HVS[];
+extern const uint16_t EXP_CDF_TABLE[][16];
+extern const uint16_t LAPLACE_OFFSET[];
+# define PVQ_MAX_PARTITIONS (1 + 3*(OD_NBSIZES-1))
+# define OD_NOREF_ADAPT_SPEED (4)
+/* Normalized lambda for PVQ quantizer. Since we normalize the gain by q, the
+   distortion is normalized by q^2 and lambda does not need the q^2 factor.
+   At high rate, this would be log(2)/6, but we're using a slightly more
+   aggressive value, closer to:
+   Li, Xiang, et al. "Laplace distribution based Lagrangian rate distortion
+   optimization for hybrid video coding." Circuits and Systems for Video
+   Technology, IEEE Transactions on 19.2 (2009): 193-205.
+   */
+# define OD_PVQ_LAMBDA (.1146)
+#define OD_PVQ_SKIP_ZERO 1
+#define OD_PVQ_SKIP_COPY 2
+/* Maximum size for coding a PVQ band. */
+#define OD_MAX_PVQ_SIZE (1024)
+#if defined(OD_FLOAT_PVQ)
+#define OD_QM_SHIFT (15)
+#define OD_QM_SHIFT (11)
+#define OD_QM_SCALE (1 << OD_QM_SHIFT)
+#if defined(OD_FLOAT_PVQ)
+#define OD_QM_SCALE_1 (1./OD_QM_SCALE)
+#define OD_QM_SCALE_MAX 32767
+#define OD_QM_INV_SHIFT (12)
+#if defined(OD_FLOAT_PVQ)
+/* bs is parenthesized so that expression arguments expand correctly. */
+#define OD_QM_OFFSET(bs) ((((1 << 2*(bs)) - 1) << 2*OD_LOG_BSIZE0)/3)
+#if !defined(OD_FLOAT_PVQ)
+#define OD_THETA_SHIFT (15)
+#define OD_THETA_SCALE ((1 << OD_THETA_SHIFT)*2./M_PI)
+#define OD_TRIG_SCALE (32768)
+#define OD_BETA_SHIFT (12)
+#define OD_BETA_SCALE_1 (1./(1 << OD_BETA_SHIFT))
+/*Multiplies 16-bit a by 32-bit b, then shifts right by OD_BETA_SHIFT bits.*/
+#define OD_MULT16_32_QBETA(a, b) \
+ ((int16_t)(a)*(int64_t)(int32_t)(b) >> OD_BETA_SHIFT)
+# define OD_MULT16_16_QBETA(a, b) \
+  ((((int16_t)(a))*((int32_t)(int16_t)(b))) >> OD_BETA_SHIFT)
+#define OD_CGAIN_SHIFT (8)
+#define OD_BETA_SCALE_1 (1.)
+#define OD_THETA_SCALE (1)
+#define OD_TRIG_SCALE (1)
+#define OD_CGAIN_SCALE (1)
+/* Largest PVQ partition is half the coefficients of largest block size. */
+#define OD_FLAT_QM 0
+#define OD_HVS_QM  1
+# define OD_NSB_ADAPT_CTXS (4)
+# define OD_ADAPT_K_Q8        0
+# define OD_ADAPT_SUM_EX_Q8   1
+# define OD_ADAPT_COUNT_Q8    2
+# define OD_ADAPT_COUNT_EX_Q8 3
+# define OD_ADAPT_NO_VALUE (-2147483647-1)
+typedef struct od_pvq_adapt_ctx  od_pvq_adapt_ctx;
+typedef struct od_pvq_codeword_ctx od_pvq_codeword_ctx;
+struct od_pvq_codeword_ctx {
+  int                 pvq_adapt[2*OD_NBSIZES*OD_NSB_ADAPT_CTXS];
+  int                 pvq_k1_increment;
+  /* CDFs are size 16 even though we use fewer entries than that. */
+  uint16_t            pvq_k1_cdf[12][16];
+  uint16_t            pvq_split_cdf[22*7][8];
+  int                 pvq_split_increment;
+struct od_pvq_adapt_ctx {
+  od_pvq_codeword_ctx pvq_codeword_ctx;
+  generic_encoder     pvq_param_model[3];
+  int                 pvq_ext[OD_NBSIZES*PVQ_MAX_PARTITIONS];
+  int                 pvq_exg[OD_NPLANES_MAX][OD_NBSIZES][PVQ_MAX_PARTITIONS];
+  int                 pvq_gaintheta_increment;
+  uint16_t        pvq_gaintheta_cdf[2*OD_NBSIZES*PVQ_MAX_PARTITIONS][16];
+  int                 pvq_skip_dir_increment;
+  uint16_t        pvq_skip_dir_cdf[2*(OD_NBSIZES-1)][7];
+void od_adapt_pvq_ctx_reset(od_pvq_adapt_ctx *state, int is_keyframe);
+int od_pvq_size_ctx(int n);
+int od_pvq_k1_ctx(int n, int orig_size);
+od_val16 od_pvq_sin(od_val32 x);
+od_val16 od_pvq_cos(od_val32 x);
+#if !defined(OD_FLOAT_PVQ)
+int od_vector_log_mag(const od_coeff *x, int n);
+int od_qm_get_index(int bs, int band);
+extern const od_val16 *const OD_PVQ_BETA[2][OD_NPLANES_MAX][OD_NBSIZES + 1];
+void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm);
+int od_compute_householder(od_val16 *r, int n, od_val32 gr, int *sign,
+ int shift);
+void od_apply_householder(od_val16 *out, const od_val16 *x, const od_val16 *r,
+ int n);
+void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
+                                  const od_val16 *r, int n,
+                                  int noref, od_val32 g,
+                                  od_val32 theta, int m, int s,
+                                  const int16_t *qm_inv);
+od_val32 od_gain_expand(od_val32 cg, int q0, od_val16 beta);
+od_val32 od_pvq_compute_gain(const od_val16 *x, int n, int q0, od_val32 *g,
+ od_val16 beta, int bshift);
+int od_pvq_compute_max_theta(od_val32 qcg, od_val16 beta);
+od_val32 od_pvq_compute_theta(int t, int max_theta);
+int od_pvq_compute_k(od_val32 qcg, int itheta, od_val32 theta, int noref,
+ int n, od_val16 beta, int nodesync);
+int od_vector_is_null(const od_coeff *x, int len);
+int od_qm_offset(int bs, int xydec);
diff --git a/av1/common/pvq_state.c b/av1/common/pvq_state.c
new file mode 100644
index 0000000..45d5184
--- /dev/null
+++ b/av1/common/pvq_state.c
@@ -0,0 +1,53 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+#include "av1/common/pvq_state.h"
+#include "av1/common/odintrin.h"
+void od_adapt_ctx_reset(od_adapt_ctx *adapt, int is_keyframe) {
+  int i;    /* Index into the per-block-size tables below. */
+  int pli;  /* Plane index. */
+  /* is_keyframe is only forwarded to the PVQ sub-context reset. */
+  od_adapt_pvq_ctx_reset(&adapt->pvq, is_keyframe);
+  adapt->skip_increment = 128;
+  OD_CDFS_INIT(adapt->skip_cdf, adapt->skip_increment >> 2);
+  for (pli = 0; pli < OD_NPLANES_MAX; pli++) {
+    generic_model_init(&adapt->model_dc[pli]);
+    for (i = 0; i < OD_NBSIZES; i++) {
+      adapt->ex_g[pli][i] = 8;
+    }
+    for (i = 0; i < 4; i++) {  /* NOTE(review): ex_dc dim is OD_NBSIZES. */
+      int j;
+      for (j = 0; j < 3; j++) {
+        adapt->ex_dc[pli][i][j] = pli > 0 ? 8 : 32768;
+      }
+    }
+  }
+void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe, int bo,
+                            int n, int w) {  /* Fill n x n block at d[bo]. */
+  int i;  /* Row index; rows of d are w elements apart. */
+  int j;  /* Column index. */
+  if (is_keyframe) {
+    for (i = 0; i < n; i++) {
+      for (j = 0; j < n; j++) {
+        /* Zero every coefficient except DC at (0, 0), which is left as is. */
+        if (i || j) d[bo + i * w + j] = 0;
+      }
+    }
+  } else {  /* Otherwise copy the whole block, DC included, from pred. */
+    for (i = 0; i < n; i++) {
+      for (j = 0; j < n; j++) {
+        d[bo + i * w + j] = pred[i * n + j];  /* pred is packed, stride n. */
+      }
+    }
+  }
diff --git a/av1/common/pvq_state.h b/av1/common/pvq_state.h
new file mode 100644
index 0000000..6cf56fe
--- /dev/null
+++ b/av1/common/pvq_state.h
@@ -0,0 +1,56 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_state_H)
+# define _state_H (1)
+typedef struct od_state     od_state;
+typedef struct od_adapt_ctx od_adapt_ctx;
+# include "generic_code.h"
+# include "odintrin.h"
+# include "pvq.h"
+/*Adaptation speed of scalar Laplace encoding.*/
+struct od_adapt_ctx {
+  /* Support for PVQ encode/decode */
+  od_pvq_adapt_ctx pvq;
+  generic_encoder model_dc[OD_NPLANES_MAX];
+  int ex_dc[OD_NPLANES_MAX][OD_NBSIZES][3];
+  /* Joint skip flag for DC and AC */
+  uint16_t skip_cdf[OD_NBSIZES*2][4];
+  int skip_increment;
+struct od_state {
+  od_adapt_ctx adapt;
+  /* TODO(yushin): Enable this for activity masking,
+     when pvq_qm_q4 is available in AOM. */
+  /* unsigned char pvq_qm_q4[OD_NPLANES_MAX][OD_QM_SIZE]; */
+  /* Quantization matrices and their inverses. */
+  int16_t qm[OD_QM_BUFFER_SIZE];
+  int16_t qm_inv[OD_QM_BUFFER_SIZE];
+void od_adapt_ctx_reset(od_adapt_ctx *state, int is_keyframe);
+void od_init_skipped_coeffs(int16_t *d, int16_t *pred, int is_keyframe,
+ int bo, int n, int w);
diff --git a/av1/common/zigzag.h b/av1/common/zigzag.h
new file mode 100644
index 0000000..295ed23
--- /dev/null
+++ b/av1/common/zigzag.h
@@ -0,0 +1,33 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_zigzag_H)
+# define _zigzag_H (1)
+extern const unsigned char OD_ZIGZAG4_DCT_DCT[15][2];
+extern const unsigned char OD_ZIGZAG4_ADST_DCT[15][2];
+extern const unsigned char OD_ZIGZAG4_DCT_ADST[15][2];
+extern const unsigned char OD_ZIGZAG4_ADST_ADST[15][2];
+extern const unsigned char OD_ZIGZAG8_DCT_DCT[48][2];
+extern const unsigned char OD_ZIGZAG8_ADST_DCT[48][2];
+extern const unsigned char OD_ZIGZAG8_DCT_ADST[48][2];
+extern const unsigned char OD_ZIGZAG8_ADST_ADST[48][2];
+extern const unsigned char OD_ZIGZAG16_DCT_DCT[192][2];
+extern const unsigned char OD_ZIGZAG16_ADST_DCT[192][2];
+extern const unsigned char OD_ZIGZAG16_DCT_ADST[192][2];
+extern const unsigned char OD_ZIGZAG16_ADST_ADST[192][2];
+extern const unsigned char OD_ZIGZAG32_DCT_DCT[768][2];
diff --git a/av1/common/zigzag16.c b/av1/common/zigzag16.c
new file mode 100644
index 0000000..94c3487
--- /dev/null
+++ b/av1/common/zigzag16.c
@@ -0,0 +1,208 @@
+/* This file is generated by gen_zigzag16.m */
+/* clang-format off */
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_DCT[192][2] = {
+  {8, 0}, {8, 1}, {8, 2}, {9, 0},
+  {8, 3}, {9, 1}, {9, 2}, {10, 0},
+  {9, 3}, {10, 1}, {10, 2}, {11, 0},
+  {10, 3}, {11, 1}, {11, 2}, {11, 3},
+  {12, 0}, {12, 1}, {13, 0}, {12, 2},
+  {12, 3}, {13, 1}, {13, 2}, {14, 0},
+  {13, 3}, {14, 1}, {15, 0}, {14, 2},
+  {14, 3}, {15, 1}, {15, 2}, {15, 3},
+  {0, 8}, {1, 8}, {0, 9}, {2, 8},
+  {1, 9}, {3, 8}, {0, 10}, {2, 9},
+  {1, 10}, {3, 9}, {0, 11}, {2, 10},
+  {1, 11}, {3, 10}, {0, 12}, {2, 11},
+  {1, 12}, {3, 11}, {0, 13}, {2, 12},
+  {1, 13}, {0, 14}, {3, 12}, {2, 13},
+  {1, 14}, {3, 13}, {0, 15}, {2, 14},
+  {1, 15}, {3, 14}, {2, 15}, {3, 15},
+  {4, 8}, {5, 8}, {4, 9}, {8, 4},
+  {8, 5}, {6, 8}, {5, 9}, {4, 10},
+  {9, 4}, {8, 6}, {7, 8}, {9, 5},
+  {5, 10}, {8, 7}, {6, 9}, {4, 11},
+  {10, 4}, {9, 6}, {7, 9}, {8, 8},
+  {10, 5}, {6, 10}, {5, 11}, {9, 7},
+  {8, 9}, {10, 6}, {7, 10}, {4, 12},
+  {11, 4}, {9, 8}, {6, 11}, {10, 7},
+  {11, 5}, {5, 12}, {8, 10}, {7, 11},
+  {9, 9}, {4, 13}, {10, 8}, {11, 6},
+  {11, 7}, {6, 12}, {8, 11}, {9, 10},
+  {12, 4}, {5, 13}, {10, 9}, {12, 5},
+  {7, 12}, {11, 8}, {4, 14}, {6, 13},
+  {10, 10}, {9, 11}, {12, 6}, {13, 4},
+  {11, 9}, {8, 12}, {5, 14}, {12, 7},
+  {7, 13}, {4, 15}, {13, 5}, {10, 11},
+  {11, 10}, {9, 12}, {13, 6}, {12, 8},
+  {6, 14}, {8, 13}, {5, 15}, {13, 7},
+  {14, 4}, {12, 9}, {7, 14}, {11, 11},
+  {10, 12}, {9, 13}, {14, 5}, {6, 15},
+  {13, 8}, {8, 14}, {12, 10}, {14, 6},
+  {7, 15}, {13, 9}, {15, 4}, {10, 13},
+  {11, 12}, {14, 7}, {9, 14}, {12, 11},
+  {8, 15}, {15, 5}, {13, 10}, {14, 8},
+  {11, 13}, {15, 6}, {9, 15}, {10, 14},
+  {14, 9}, {15, 7}, {13, 11}, {12, 12},
+  {10, 15}, {11, 14}, {15, 8}, {14, 10},
+  {12, 13}, {13, 12}, {15, 9}, {11, 15},
+  {14, 11}, {13, 13}, {15, 10}, {12, 14},
+  {13, 14}, {15, 11}, {14, 12}, {12, 15},
+  {14, 13}, {13, 15}, {15, 12}, {14, 14},
+  {15, 13}, {14, 15}, {15, 14}, {15, 15}
+  };
+OD_EXTERN const unsigned char OD_ZIGZAG16_ADST_DCT[192][2] = {
+  {8, 0}, {9, 0}, {10, 0}, {8, 1},
+  {11, 0}, {9, 1}, {8, 2}, {12, 0},
+  {10, 1}, {9, 2}, {8, 3}, {13, 0},
+  {11, 1}, {10, 2}, {9, 3}, {14, 0},
+  {12, 1}, {10, 3}, {15, 0}, {11, 2},
+  {13, 1}, {11, 3}, {12, 2}, {14, 1},
+  {12, 3}, {13, 2}, {15, 1}, {13, 3},
+  {14, 2}, {14, 3}, {15, 2}, {15, 3},
+  {0, 8}, {1, 8}, {2, 8}, {0, 9},
+  {3, 8}, {1, 9}, {2, 9}, {0, 10},
+  {3, 9}, {1, 10}, {2, 10}, {0, 11},
+  {3, 10}, {1, 11}, {2, 11}, {0, 12},
+  {3, 11}, {1, 12}, {2, 12}, {0, 13},
+  {3, 12}, {1, 13}, {0, 14}, {2, 13},
+  {0, 15}, {1, 14}, {3, 13}, {2, 14},
+  {1, 15}, {3, 14}, {2, 15}, {3, 15},
+  {8, 4}, {9, 4}, {8, 5}, {4, 8},
+  {10, 4}, {9, 5}, {5, 8}, {8, 6},
+  {4, 9}, {10, 5}, {9, 6}, {6, 8},
+  {8, 7}, {11, 4}, {7, 8}, {5, 9},
+  {9, 7}, {11, 5}, {10, 6}, {4, 10},
+  {6, 9}, {8, 8}, {5, 10}, {7, 9},
+  {12, 4}, {10, 7}, {9, 8}, {11, 6},
+  {8, 9}, {4, 11}, {6, 10}, {7, 10},
+  {12, 5}, {5, 11}, {10, 8}, {11, 7},
+  {9, 9}, {4, 12}, {13, 4}, {8, 10},
+  {6, 11}, {12, 6}, {5, 12}, {10, 9},
+  {7, 11}, {9, 10}, {11, 8}, {13, 5},
+  {8, 11}, {4, 13}, {6, 12}, {10, 10},
+  {12, 7}, {11, 9}, {7, 12}, {14, 4},
+  {5, 13}, {9, 11}, {13, 6}, {8, 12},
+  {4, 14}, {12, 8}, {6, 13}, {11, 10},
+  {10, 11}, {12, 9}, {5, 14}, {13, 7},
+  {14, 5}, {9, 12}, {4, 15}, {7, 13},
+  {8, 13}, {6, 14}, {13, 8}, {11, 11},
+  {10, 12}, {15, 4}, {12, 10}, {14, 6},
+  {13, 9}, {5, 15}, {9, 13}, {7, 14},
+  {15, 5}, {6, 15}, {8, 14}, {14, 7},
+  {11, 12}, {7, 15}, {9, 14}, {13, 10},
+  {10, 13}, {14, 8}, {15, 6}, {14, 9},
+  {12, 11}, {8, 15}, {15, 7}, {10, 14},
+  {11, 13}, {9, 15}, {13, 11}, {12, 12},
+  {15, 8}, {14, 10}, {15, 9}, {10, 15},
+  {11, 14}, {13, 12}, {12, 13}, {15, 10},
+  {14, 11}, {11, 15}, {13, 13}, {15, 11},
+  {14, 12}, {12, 14}, {15, 12}, {13, 14},
+  {12, 15}, {14, 13}, {13, 15}, {15, 13},
+  {14, 14}, {15, 14}, {14, 15}, {15, 15}
+  };
+OD_EXTERN const unsigned char OD_ZIGZAG16_DCT_ADST[192][2] = {
+  {8, 0}, {8, 1}, {8, 2}, {8, 3},
+  {9, 0}, {9, 1}, {9, 2}, {9, 3},
+  {10, 0}, {10, 1}, {10, 2}, {10, 3},
+  {11, 0}, {11, 1}, {11, 2}, {11, 3},
+  {12, 0}, {12, 1}, {12, 2}, {12, 3},
+  {13, 0}, {13, 1}, {13, 2}, {13, 3},
+  {14, 0}, {15, 0}, {14, 1}, {14, 2},
+  {14, 3}, {15, 1}, {15, 2}, {15, 3},
+  {0, 8}, {0, 9}, {0, 10}, {1, 8},
+  {0, 11}, {1, 9}, {2, 8}, {0, 12},
+  {1, 10}, {2, 9}, {0, 13}, {1, 11},
+  {3, 8}, {2, 10}, {0, 14}, {1, 12},
+  {3, 9}, {0, 15}, {2, 11}, {3, 10},
+  {1, 13}, {2, 12}, {3, 11}, {1, 14},
+  {2, 13}, {1, 15}, {3, 12}, {2, 14},
+  {3, 13}, {2, 15}, {3, 14}, {3, 15},
+  {4, 8}, {4, 9}, {5, 8}, {4, 10},
+  {5, 9}, {4, 11}, {6, 8}, {5, 10},
+  {8, 4}, {6, 9}, {4, 12}, {5, 11},
+  {8, 5}, {6, 10}, {7, 8}, {8, 6},
+  {4, 13}, {7, 9}, {5, 12}, {8, 7},
+  {9, 4}, {6, 11}, {8, 8}, {7, 10},
+  {5, 13}, {9, 5}, {4, 14}, {9, 6},
+  {8, 9}, {6, 12}, {9, 7}, {7, 11},
+  {4, 15}, {8, 10}, {9, 8}, {5, 14},
+  {10, 4}, {6, 13}, {10, 5}, {9, 9},
+  {7, 12}, {8, 11}, {10, 6}, {5, 15},
+  {10, 7}, {6, 14}, {9, 10}, {7, 13},
+  {8, 12}, {10, 8}, {9, 11}, {6, 15},
+  {11, 4}, {11, 5}, {10, 9}, {8, 13},
+  {7, 14}, {11, 6}, {9, 12}, {11, 7},
+  {10, 10}, {7, 15}, {8, 14}, {12, 4},
+  {11, 8}, {12, 5}, {9, 13}, {10, 11},
+  {8, 15}, {11, 9}, {12, 6}, {12, 7},
+  {10, 12}, {9, 14}, {11, 10}, {13, 4},
+  {12, 8}, {9, 15}, {13, 5}, {11, 11},
+  {12, 9}, {10, 13}, {13, 6}, {13, 7},
+  {12, 10}, {14, 4}, {11, 12}, {13, 8},
+  {10, 14}, {14, 5}, {12, 11}, {13, 9},
+  {14, 6}, {10, 15}, {11, 13}, {15, 4},
+  {14, 7}, {12, 12}, {13, 10}, {14, 8},
+  {15, 5}, {13, 11}, {15, 6}, {11, 14},
+  {14, 9}, {12, 13}, {11, 15}, {15, 7},
+  {14, 10}, {15, 8}, {13, 12}, {12, 14},
+  {15, 9}, {14, 11}, {13, 13}, {12, 15},
+  {15, 10}, {14, 12}, {13, 14}, {15, 11},
+  {13, 15}, {14, 13}, {14, 14}, {15, 12},
+  {14, 15}, {15, 13}, {15, 14}, {15, 15}
+  };
+OD_EXTERN const unsigned char OD_ZIGZAG16_ADST_ADST[192][2] = {
+  {8, 0}, {8, 1}, {8, 2}, {9, 0},
+  {8, 3}, {9, 1}, {9, 2}, {10, 0},
+  {9, 3}, {10, 1}, {10, 2}, {11, 0},
+  {10, 3}, {11, 1}, {11, 2}, {11, 3},
+  {12, 0}, {12, 1}, {13, 0}, {12, 2},
+  {12, 3}, {13, 1}, {13, 2}, {14, 0},
+  {13, 3}, {14, 1}, {15, 0}, {14, 2},
+  {14, 3}, {15, 1}, {15, 2}, {15, 3},
+  {0, 8}, {1, 8}, {0, 9}, {2, 8},
+  {1, 9}, {3, 8}, {0, 10}, {2, 9},
+  {1, 10}, {3, 9}, {0, 11}, {2, 10},
+  {1, 11}, {3, 10}, {0, 12}, {2, 11},
+  {1, 12}, {3, 11}, {0, 13}, {2, 12},
+  {1, 13}, {0, 14}, {3, 12}, {2, 13},
+  {1, 14}, {3, 13}, {0, 15}, {2, 14},
+  {1, 15}, {3, 14}, {2, 15}, {3, 15},
+  {4, 8}, {5, 8}, {4, 9}, {8, 4},
+  {8, 5}, {6, 8}, {5, 9}, {4, 10},
+  {9, 4}, {8, 6}, {7, 8}, {9, 5},
+  {5, 10}, {8, 7}, {6, 9}, {4, 11},
+  {10, 4}, {9, 6}, {7, 9}, {8, 8},
+  {10, 5}, {6, 10}, {5, 11}, {9, 7},
+  {8, 9}, {10, 6}, {7, 10}, {4, 12},
+  {11, 4}, {9, 8}, {6, 11}, {10, 7},
+  {11, 5}, {5, 12}, {8, 10}, {7, 11},
+  {9, 9}, {4, 13}, {10, 8}, {11, 6},
+  {11, 7}, {6, 12}, {8, 11}, {9, 10},
+  {12, 4}, {5, 13}, {10, 9}, {12, 5},
+  {7, 12}, {11, 8}, {4, 14}, {6, 13},
+  {10, 10}, {9, 11}, {12, 6}, {13, 4},
+  {11, 9}, {8, 12}, {5, 14}, {12, 7},
+  {7, 13}, {4, 15}, {13, 5}, {10, 11},
+  {11, 10}, {9, 12}, {13, 6}, {12, 8},
+  {6, 14}, {8, 13}, {5, 15}, {13, 7},
+  {14, 4}, {12, 9}, {7, 14}, {11, 11},
+  {10, 12}, {9, 13}, {14, 5}, {6, 15},
+  {13, 8}, {8, 14}, {12, 10}, {14, 6},
+  {7, 15}, {13, 9}, {15, 4}, {10, 13},
+  {11, 12}, {14, 7}, {9, 14}, {12, 11},
+  {8, 15}, {15, 5}, {13, 10}, {14, 8},
+  {11, 13}, {15, 6}, {9, 15}, {10, 14},
+  {14, 9}, {15, 7}, {13, 11}, {12, 12},
+  {10, 15}, {11, 14}, {15, 8}, {14, 10},
+  {12, 13}, {13, 12}, {15, 9}, {11, 15},
+  {14, 11}, {13, 13}, {15, 10}, {12, 14},
+  {13, 14}, {15, 11}, {14, 12}, {12, 15},
+  {14, 13}, {13, 15}, {15, 12}, {14, 14},
+  {15, 13}, {14, 15}, {15, 14}, {15, 15}
+  };
diff --git a/av1/common/zigzag32.c b/av1/common/zigzag32.c
new file mode 100644
index 0000000..cb3b9bc
--- /dev/null
+++ b/av1/common/zigzag32.c
@@ -0,0 +1,199 @@
+/* This file is generated by gen_zigzag32.m */
+/* clang-format off */
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG32_DCT_DCT[768][2] = {
+  { 16, 0 }, { 17, 0 }, { 18, 0 }, { 19, 0 },
+  { 16, 1 }, { 17, 1 }, { 20, 0 }, { 16, 2 },
+  { 18, 1 }, { 21, 0 }, { 17, 2 }, { 16, 3 },
+  { 19, 1 }, { 22, 0 }, { 18, 2 }, { 17, 3 },
+  { 20, 1 }, { 16, 4 }, { 23, 0 }, { 19, 2 },
+  { 24, 0 }, { 16, 5 }, { 21, 1 }, { 17, 4 },
+  { 18, 3 }, { 20, 2 }, { 17, 5 }, { 16, 6 },
+  { 19, 3 }, { 18, 4 }, { 25, 0 }, { 22, 1 },
+  { 16, 7 }, { 21, 2 }, { 17, 6 }, { 20, 3 },
+  { 26, 0 }, { 18, 5 }, { 19, 4 }, { 17, 7 },
+  { 23, 1 }, { 22, 2 }, { 18, 6 }, { 27, 0 },
+  { 19, 5 }, { 24, 1 }, { 21, 3 }, { 28, 0 },
+  { 20, 4 }, { 18, 7 }, { 19, 6 }, { 23, 2 },
+  { 29, 0 }, { 25, 1 }, { 21, 4 }, { 30, 0 },
+  { 20, 5 }, { 22, 3 }, { 31, 0 }, { 19, 7 },
+  { 24, 2 }, { 26, 1 }, { 20, 6 }, { 21, 5 },
+  { 22, 4 }, { 23, 3 }, { 27, 1 }, { 25, 2 },
+  { 20, 7 }, { 28, 1 }, { 24, 3 }, { 21, 6 },
+  { 22, 5 }, { 23, 4 }, { 26, 2 }, { 21, 7 },
+  { 29, 1 }, { 25, 3 }, { 30, 1 }, { 27, 2 },
+  { 22, 6 }, { 23, 5 }, { 31, 1 }, { 24, 4 },
+  { 26, 3 }, { 28, 2 }, { 22, 7 }, { 23, 6 },
+  { 25, 4 }, { 24, 5 }, { 29, 2 }, { 30, 2 },
+  { 27, 3 }, { 23, 7 }, { 31, 2 }, { 24, 6 },
+  { 26, 4 }, { 25, 5 }, { 28, 3 }, { 24, 7 },
+  { 27, 4 }, { 29, 3 }, { 25, 6 }, { 26, 5 },
+  { 30, 3 }, { 31, 3 }, { 28, 4 }, { 27, 5 },
+  { 25, 7 }, { 29, 4 }, { 26, 6 }, { 28, 5 },
+  { 30, 4 }, { 26, 7 }, { 27, 6 }, { 31, 4 },
+  { 29, 5 }, { 27, 7 }, { 30, 5 }, { 28, 6 },
+  { 31, 5 }, { 29, 6 }, { 28, 7 }, { 30, 6 },
+  { 31, 6 }, { 29, 7 }, { 30, 7 }, { 31, 7 },
+  { 0, 16 }, { 0, 17 }, { 1, 16 }, { 0, 18 },
+  { 1, 17 }, { 0, 19 }, { 2, 16 }, { 1, 18 },
+  { 0, 20 }, { 2, 17 }, { 3, 16 }, { 1, 19 },
+  { 2, 18 }, { 0, 21 }, { 3, 17 }, { 4, 16 },
+  { 1, 20 }, { 2, 19 }, { 0, 22 }, { 3, 18 },
+  { 4, 17 }, { 5, 16 }, { 0, 23 }, { 3, 19 },
+  { 2, 20 }, { 1, 21 }, { 4, 18 }, { 6, 16 },
+  { 5, 17 }, { 3, 20 }, { 2, 21 }, { 1, 22 },
+  { 0, 24 }, { 0, 25 }, { 4, 19 }, { 7, 16 },
+  { 6, 17 }, { 5, 18 }, { 0, 26 }, { 3, 21 },
+  { 2, 22 }, { 1, 23 }, { 4, 20 }, { 5, 19 },
+  { 6, 18 }, { 1, 24 }, { 7, 17 }, { 0, 27 },
+  { 2, 23 }, { 3, 22 }, { 4, 21 }, { 1, 25 },
+  { 5, 20 }, { 7, 18 }, { 0, 28 }, { 6, 19 },
+  { 2, 24 }, { 1, 26 }, { 0, 29 }, { 4, 22 },
+  { 3, 23 }, { 2, 25 }, { 5, 21 }, { 0, 31 },
+  { 7, 19 }, { 6, 20 }, { 0, 30 }, { 1, 27 },
+  { 3, 24 }, { 2, 26 }, { 4, 23 }, { 5, 22 },
+  { 7, 20 }, { 1, 28 }, { 6, 21 }, { 3, 25 },
+  { 2, 27 }, { 1, 29 }, { 4, 24 }, { 2, 28 },
+  { 1, 30 }, { 7, 21 }, { 5, 23 }, { 3, 26 },
+  { 6, 22 }, { 1, 31 }, { 4, 25 }, { 7, 22 },
+  { 3, 27 }, { 2, 29 }, { 2, 30 }, { 5, 24 },
+  { 2, 31 }, { 6, 23 }, { 4, 26 }, { 3, 28 },
+  { 5, 25 }, { 3, 29 }, { 6, 24 }, { 7, 23 },
+  { 3, 30 }, { 4, 27 }, { 3, 31 }, { 5, 26 },
+  { 6, 25 }, { 4, 28 }, { 7, 24 }, { 4, 29 },
+  { 5, 27 }, { 4, 30 }, { 4, 31 }, { 6, 26 },
+  { 5, 28 }, { 7, 25 }, { 6, 27 }, { 5, 29 },
+  { 7, 26 }, { 5, 30 }, { 5, 31 }, { 6, 28 },
+  { 7, 27 }, { 6, 29 }, { 6, 30 }, { 7, 28 },
+  { 6, 31 }, { 7, 29 }, { 7, 30 }, { 7, 31 },
+  { 8, 16 }, { 9, 16 }, { 8, 17 }, { 10, 16 },
+  { 9, 17 }, { 16, 8 }, { 8, 18 }, { 16, 9 },
+  { 10, 17 }, { 11, 16 }, { 17, 8 }, { 9, 18 },
+  { 8, 19 }, { 16, 10 }, { 11, 17 }, { 12, 16 },
+  { 10, 18 }, { 17, 9 }, { 9, 19 }, { 16, 11 },
+  { 8, 20 }, { 18, 8 }, { 17, 10 }, { 10, 19 },
+  { 12, 17 }, { 11, 18 }, { 9, 20 }, { 16, 12 },
+  { 18, 9 }, { 8, 21 }, { 13, 16 }, { 17, 11 },
+  { 19, 8 }, { 18, 10 }, { 13, 17 }, { 16, 13 },
+  { 11, 19 }, { 12, 18 }, { 10, 20 }, { 17, 12 },
+  { 9, 21 }, { 19, 9 }, { 8, 22 }, { 14, 16 },
+  { 18, 11 }, { 11, 20 }, { 10, 21 }, { 20, 8 },
+  { 13, 18 }, { 16, 14 }, { 12, 19 }, { 17, 13 },
+  { 19, 10 }, { 14, 17 }, { 9, 22 }, { 18, 12 },
+  { 8, 23 }, { 17, 14 }, { 20, 9 }, { 15, 16 },
+  { 16, 15 }, { 13, 19 }, { 10, 22 }, { 19, 11 },
+  { 11, 21 }, { 14, 18 }, { 12, 20 }, { 18, 13 },
+  { 20, 10 }, { 21, 8 }, { 15, 17 }, { 9, 23 },
+  { 19, 12 }, { 11, 22 }, { 8, 24 }, { 21, 9 },
+  { 17, 15 }, { 16, 16 }, { 14, 19 }, { 18, 14 },
+  { 12, 21 }, { 13, 20 }, { 20, 11 }, { 10, 23 },
+  { 19, 13 }, { 15, 18 }, { 16, 17 }, { 21, 10 },
+  { 22, 8 }, { 9, 24 }, { 8, 25 }, { 20, 12 },
+  { 15, 19 }, { 11, 23 }, { 17, 16 }, { 18, 15 },
+  { 14, 20 }, { 12, 22 }, { 10, 24 }, { 22, 9 },
+  { 21, 11 }, { 19, 14 }, { 13, 21 }, { 16, 18 },
+  { 9, 25 }, { 17, 17 }, { 8, 26 }, { 20, 13 },
+  { 23, 8 }, { 12, 23 }, { 13, 22 }, { 22, 10 },
+  { 19, 15 }, { 15, 20 }, { 16, 19 }, { 21, 12 },
+  { 11, 24 }, { 14, 21 }, { 8, 27 }, { 18, 16 },
+  { 10, 25 }, { 9, 26 }, { 22, 11 }, { 20, 14 },
+  { 23, 9 }, { 18, 17 }, { 17, 18 }, { 17, 19 },
+  { 19, 16 }, { 21, 13 }, { 10, 26 }, { 12, 24 },
+  { 23, 10 }, { 24, 8 }, { 8, 28 }, { 16, 20 },
+  { 9, 27 }, { 15, 21 }, { 22, 12 }, { 14, 22 },
+  { 13, 23 }, { 20, 15 }, { 11, 25 }, { 24, 9 },
+  { 18, 18 }, { 19, 17 }, { 23, 11 }, { 10, 27 },
+  { 8, 29 }, { 12, 25 }, { 9, 28 }, { 8, 30 },
+  { 21, 14 }, { 13, 24 }, { 11, 26 }, { 25, 8 },
+  { 24, 10 }, { 20, 16 }, { 19, 18 }, { 14, 23 },
+  { 22, 13 }, { 8, 31 }, { 17, 20 }, { 9, 29 },
+  { 23, 12 }, { 15, 22 }, { 25, 9 }, { 11, 27 },
+  { 10, 28 }, { 20, 17 }, { 21, 15 }, { 18, 19 },
+  { 16, 21 }, { 24, 11 }, { 9, 30 }, { 12, 26 },
+  { 10, 29 }, { 22, 14 }, { 14, 24 }, { 9, 31 },
+  { 26, 8 }, { 13, 25 }, { 25, 10 }, { 18, 20 },
+  { 19, 19 }, { 11, 28 }, { 15, 23 }, { 20, 18 },
+  { 10, 30 }, { 12, 27 }, { 17, 21 }, { 23, 13 },
+  { 24, 12 }, { 21, 16 }, { 16, 22 }, { 26, 9 },
+  { 27, 8 }, { 13, 26 }, { 22, 15 }, { 10, 31 },
+  { 14, 25 }, { 12, 28 }, { 25, 11 }, { 21, 17 },
+  { 26, 10 }, { 20, 19 }, { 11, 29 }, { 15, 24 },
+  { 23, 14 }, { 27, 9 }, { 11, 30 }, { 13, 27 },
+  { 19, 20 }, { 24, 13 }, { 28, 8 }, { 11, 31 },
+  { 22, 16 }, { 17, 22 }, { 16, 23 }, { 25, 12 },
+  { 18, 21 }, { 12, 29 }, { 21, 18 }, { 28, 9 },
+  { 27, 10 }, { 26, 11 }, { 29, 8 }, { 14, 26 },
+  { 15, 25 }, { 13, 28 }, { 12, 30 }, { 23, 15 },
+  { 30, 8 }, { 16, 24 }, { 13, 29 }, { 25, 13 },
+  { 24, 14 }, { 20, 20 }, { 31, 8 }, { 12, 31 },
+  { 14, 27 }, { 28, 10 }, { 26, 12 }, { 22, 17 },
+  { 21, 19 }, { 17, 23 }, { 18, 22 }, { 29, 9 },
+  { 27, 11 }, { 19, 21 }, { 27, 12 }, { 30, 9 },
+  { 31, 9 }, { 13, 30 }, { 24, 15 }, { 23, 16 },
+  { 15, 26 }, { 14, 28 }, { 29, 10 }, { 28, 11 },
+  { 26, 13 }, { 17, 24 }, { 13, 31 }, { 25, 14 },
+  { 22, 18 }, { 16, 25 }, { 30, 10 }, { 14, 29 },
+  { 15, 27 }, { 19, 22 }, { 21, 20 }, { 20, 21 },
+  { 27, 13 }, { 29, 11 }, { 18, 23 }, { 23, 17 },
+  { 16, 26 }, { 31, 10 }, { 24, 16 }, { 14, 30 },
+  { 22, 19 }, { 14, 31 }, { 28, 12 }, { 26, 14 },
+  { 30, 11 }, { 15, 28 }, { 25, 15 }, { 17, 25 },
+  { 23, 18 }, { 18, 24 }, { 15, 30 }, { 29, 12 },
+  { 31, 11 }, { 16, 27 }, { 24, 17 }, { 28, 13 },
+  { 19, 23 }, { 15, 29 }, { 25, 16 }, { 17, 26 },
+  { 27, 14 }, { 22, 20 }, { 15, 31 }, { 20, 22 },
+  { 21, 21 }, { 16, 28 }, { 17, 27 }, { 30, 12 },
+  { 26, 15 }, { 19, 24 }, { 18, 25 }, { 23, 19 },
+  { 29, 13 }, { 31, 12 }, { 24, 18 }, { 26, 16 },
+  { 25, 17 }, { 16, 29 }, { 28, 14 }, { 20, 23 },
+  { 18, 26 }, { 21, 22 }, { 19, 25 }, { 22, 21 },
+  { 27, 15 }, { 17, 28 }, { 16, 30 }, { 26, 17 },
+  { 23, 20 }, { 16, 31 }, { 25, 18 }, { 27, 16 },
+  { 20, 24 }, { 24, 19 }, { 31, 13 }, { 30, 13 },
+  { 29, 14 }, { 18, 27 }, { 28, 15 }, { 17, 29 },
+  { 19, 26 }, { 17, 30 }, { 21, 23 }, { 22, 22 },
+  { 30, 14 }, { 20, 25 }, { 23, 21 }, { 17, 31 },
+  { 18, 28 }, { 25, 19 }, { 24, 20 }, { 28, 16 },
+  { 31, 14 }, { 26, 18 }, { 19, 27 }, { 29, 15 },
+  { 27, 17 }, { 30, 15 }, { 21, 24 }, { 22, 23 },
+  { 26, 19 }, { 23, 22 }, { 28, 17 }, { 29, 16 },
+  { 18, 30 }, { 24, 21 }, { 25, 20 }, { 18, 31 },
+  { 18, 29 }, { 20, 26 }, { 19, 28 }, { 27, 18 },
+  { 31, 15 }, { 20, 27 }, { 30, 16 }, { 19, 29 },
+  { 29, 17 }, { 31, 16 }, { 27, 19 }, { 21, 25 },
+  { 28, 18 }, { 26, 20 }, { 22, 24 }, { 25, 21 },
+  { 19, 30 }, { 24, 22 }, { 30, 17 }, { 21, 26 },
+  { 23, 23 }, { 19, 31 }, { 20, 28 }, { 31, 17 },
+  { 28, 19 }, { 27, 20 }, { 21, 27 }, { 29, 18 },
+  { 30, 18 }, { 25, 22 }, { 26, 21 }, { 20, 29 },
+  { 22, 25 }, { 24, 23 }, { 29, 19 }, { 23, 24 },
+  { 20, 31 }, { 20, 30 }, { 28, 20 }, { 21, 28 },
+  { 22, 26 }, { 31, 18 }, { 27, 21 }, { 30, 19 },
+  { 22, 27 }, { 29, 20 }, { 23, 25 }, { 24, 24 },
+  { 26, 22 }, { 21, 29 }, { 25, 23 }, { 31, 19 },
+  { 21, 30 }, { 23, 26 }, { 28, 21 }, { 21, 31 },
+  { 22, 28 }, { 30, 20 }, { 25, 24 }, { 27, 22 },
+  { 29, 21 }, { 26, 23 }, { 24, 25 }, { 31, 20 },
+  { 23, 27 }, { 22, 29 }, { 30, 21 }, { 28, 22 },
+  { 24, 26 }, { 25, 25 }, { 27, 23 }, { 22, 30 },
+  { 23, 28 }, { 22, 31 }, { 26, 24 }, { 31, 21 },
+  { 24, 27 }, { 29, 22 }, { 27, 24 }, { 30, 22 },
+  { 25, 26 }, { 28, 23 }, { 23, 30 }, { 23, 29 },
+  { 24, 28 }, { 25, 27 }, { 31, 22 }, { 23, 31 },
+  { 26, 25 }, { 28, 24 }, { 29, 23 }, { 24, 29 },
+  { 24, 30 }, { 27, 25 }, { 25, 28 }, { 26, 26 },
+  { 30, 23 }, { 26, 27 }, { 31, 23 }, { 28, 25 },
+  { 27, 26 }, { 25, 29 }, { 24, 31 }, { 29, 24 },
+  { 30, 24 }, { 27, 27 }, { 29, 25 }, { 26, 28 },
+  { 31, 24 }, { 25, 30 }, { 25, 31 }, { 28, 26 },
+  { 27, 28 }, { 26, 29 }, { 30, 25 }, { 29, 26 },
+  { 28, 27 }, { 26, 30 }, { 31, 25 }, { 27, 29 },
+  { 26, 31 }, { 30, 26 }, { 28, 28 }, { 31, 26 },
+  { 29, 27 }, { 27, 30 }, { 28, 29 }, { 27, 31 },
+  { 30, 27 }, { 31, 27 }, { 28, 30 }, { 29, 28 },
+  { 30, 28 }, { 29, 29 }, { 30, 29 }, { 31, 28 },
+  { 28, 31 }, { 29, 30 }, { 29, 31 }, { 31, 29 },
+  { 30, 30 }, { 30, 31 }, { 31, 30 }, { 31, 31 }
diff --git a/av1/common/zigzag4.c b/av1/common/zigzag4.c
new file mode 100644
index 0000000..7ccc160
--- /dev/null
+++ b/av1/common/zigzag4.c
@@ -0,0 +1,28 @@
+/* This file is generated by gen_zigzag4.m */
+/* clang-format off */
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_DCT[15][2] = {
+  {0, 1}, {1, 0}, {1, 1}, {0, 2},
+  {2, 0}, {0, 3}, {1, 2}, {3, 0},
+  {2, 1}, {1, 3}, {2, 2}, {3, 1},
+  {2, 3}, {3, 2}, {3, 3} };
+OD_EXTERN const unsigned char OD_ZIGZAG4_ADST_DCT[15][2] = {
+  {1, 0}, {0, 1}, {2, 0}, {1, 1},
+  {3, 0}, {2, 1}, {0, 2}, {1, 2},
+  {3, 1}, {0, 3}, {2, 2}, {1, 3},
+  {3, 2}, {2, 3}, {3, 3} };
+OD_EXTERN const unsigned char OD_ZIGZAG4_DCT_ADST[15][2] = {
+  {0, 1}, {0, 2}, {1, 0}, {0, 3},
+  {1, 1}, {1, 2}, {2, 0}, {1, 3},
+  {2, 1}, {2, 2}, {3, 0}, {3, 1},
+  {2, 3}, {3, 2}, {3, 3} };
+OD_EXTERN const unsigned char OD_ZIGZAG4_ADST_ADST[15][2] = {
+  {0, 1}, {1, 0}, {1, 1}, {0, 2},
+  {2, 0}, {0, 3}, {1, 2}, {3, 0},
+  {2, 1}, {1, 3}, {2, 2}, {3, 1},
+  {2, 3}, {3, 2}, {3, 3} };
diff --git a/av1/common/zigzag8.c b/av1/common/zigzag8.c
new file mode 100644
index 0000000..ba39ac0
--- /dev/null
+++ b/av1/common/zigzag8.c
@@ -0,0 +1,65 @@
+/* This file is generated by gen_zigzag8.m */
+/* clang-format off */
+#include "odintrin.h"
+OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_DCT[48][2] = {
+  {4, 0}, {4, 1}, {5, 0}, {5, 1},
+  {6, 0}, {7, 0}, {6, 1}, {7, 1},
+  {0, 4}, {1, 4}, {0, 5}, {1, 5},
+  {0, 6}, {1, 6}, {0, 7}, {1, 7},
+  {2, 4}, {4, 2}, {3, 4}, {2, 5},
+  {4, 3}, {5, 2}, {4, 4}, {3, 5},
+  {5, 3}, {2, 6}, {4, 5}, {6, 2},
+  {5, 4}, {3, 6}, {2, 7}, {6, 3},
+  {5, 5}, {7, 2}, {4, 6}, {3, 7},
+  {6, 4}, {7, 3}, {4, 7}, {5, 6},
+  {6, 5}, {7, 4}, {5, 7}, {6, 6},
+  {7, 5}, {6, 7}, {7, 6}, {7, 7}
+  };
+OD_EXTERN const unsigned char OD_ZIGZAG8_ADST_DCT[48][2] = {
+  {4, 0}, {5, 0}, {4, 1}, {6, 0},
+  {5, 1}, {7, 0}, {6, 1}, {7, 1},
+  {0, 4}, {1, 4}, {0, 5}, {1, 5},
+  {0, 6}, {1, 6}, {0, 7}, {1, 7},
+  {4, 2}, {2, 4}, {5, 2}, {4, 3},
+  {3, 4}, {2, 5}, {5, 3}, {4, 4},
+  {6, 2}, {3, 5}, {5, 4}, {2, 6},
+  {4, 5}, {6, 3}, {7, 2}, {3, 6},
+  {2, 7}, {5, 5}, {6, 4}, {4, 6},
+  {7, 3}, {3, 7}, {5, 6}, {6, 5},
+  {4, 7}, {7, 4}, {5, 7}, {7, 5},
+  {6, 6}, {7, 6}, {6, 7}, {7, 7}
+  };
+OD_EXTERN const unsigned char OD_ZIGZAG8_DCT_ADST[48][2] = {
+  {4, 0}, {4, 1}, {5, 0}, {5, 1},
+  {6, 0}, {6, 1}, {7, 0}, {7, 1},
+  {0, 4}, {0, 5}, {1, 4}, {0, 6},
+  {1, 5}, {0, 7}, {1, 6}, {1, 7},
+  {2, 4}, {2, 5}, {3, 4}, {4, 2},
+  {2, 6}, {4, 3}, {3, 5}, {4, 4},
+  {2, 7}, {3, 6}, {5, 2}, {4, 5},
+  {5, 3}, {3, 7}, {5, 4}, {4, 6},
+  {6, 2}, {5, 5}, {4, 7}, {6, 3},
+  {6, 4}, {5, 6}, {7, 2}, {6, 5},
+  {7, 3}, {5, 7}, {7, 4}, {6, 6},
+  {7, 5}, {6, 7}, {7, 6}, {7, 7}
+  };
+OD_EXTERN const unsigned char OD_ZIGZAG8_ADST_ADST[48][2] = {
+  {4, 0}, {4, 1}, {5, 0}, {5, 1},
+  {6, 0}, {7, 0}, {6, 1}, {7, 1},
+  {0, 4}, {1, 4}, {0, 5}, {1, 5},
+  {0, 6}, {1, 6}, {0, 7}, {1, 7},
+  {2, 4}, {4, 2}, {3, 4}, {2, 5},
+  {4, 3}, {5, 2}, {4, 4}, {3, 5},
+  {5, 3}, {2, 6}, {4, 5}, {6, 2},
+  {5, 4}, {3, 6}, {2, 7}, {6, 3},
+  {5, 5}, {7, 2}, {4, 6}, {3, 7},
+  {6, 4}, {7, 3}, {4, 7}, {5, 6},
+  {6, 5}, {7, 4}, {5, 7}, {6, 6},
+  {7, 5}, {6, 7}, {7, 6}, {7, 7}
+  };
diff --git a/av1/decoder/decint.h b/av1/decoder/decint.h
new file mode 100644
index 0000000..99dbc43
--- /dev/null
+++ b/av1/decoder/decint.h
@@ -0,0 +1,33 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_decint_H)
+# define _decint_H (1)
+# include "av1/common/pvq_state.h"
+# include "aom_dsp/entdec.h"
+typedef struct daala_dec_ctx daala_dec_ctx;
+typedef struct daala_dec_ctx od_dec_ctx;
+struct daala_dec_ctx {
+  /* PVQ state: the adaptation context (CDFs) and quantization matrices. */
+  od_state state;
+  /* Daala entropy decoder. */
+  od_ec_dec *ec;
+  /* Quantization matrix mode: FLAT (0) or HVS (1). */
+  int qm;
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index d1d0ae7..1af7a1a 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -60,6 +60,16 @@
 #define MAX_AV1_HEADER_SIZE 80
 #define ACCT_STR __func__
+#include "av1/decoder/pvq_decoder.h"
+#include "av1/encoder/encodemb.h"
+#include "aom_dsp/entdec.h"
+#include "av1/common/partition.h"
+#include "av1/decoder/decint.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
 static struct aom_read_bit_buffer *init_read_bit_buffer(
     AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
     const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
@@ -327,6 +337,141 @@
+static int av1_pvq_decode_helper(od_dec_ctx *dec, int16_t *ref_coeff,
+                                 int16_t *dqcoeff, int16_t *quant, int pli,
+                                 int bs, TX_TYPE tx_type, int xdec,
+                                 int ac_dc_coded) { /* PVQ-decodes one
+   transform block. ref_coeff is reordered into ref_coeff_pvq, widened to
+   od_coeff and handed to od_pvq_decode(); the DC term is then decoded and
+   dequantized separately, and the block is written to dqcoeff through
+   od_coding_order_to_raster(). quant[0] is the DC quantizer and quant[1] is
+   the quantizer passed to od_pvq_decode(). Always returns
+   blk_size * blk_size as the eob. */
+  unsigned int flags;  // used for daala's stream analyzer.
+  int off;
+  const int is_keyframe = 0;
+  const int has_dc_skip = 1;
+  int quant_shift = bs == TX_32X32 ? 1 : 0;  // quantizers are halved for 32x32
+  // DC quantizer for PVQ
+  int pvq_dc_quant;
+  int lossless = (quant[0] == 0);  // a zero DC quantizer is treated as lossless
+  const int blk_size = tx_size_1d[bs];
+  int eob = 0;
+  int i;
+  // TODO(yushin) : To enable activity masking,
+  // int use_activity_masking = dec->use_activity_masking;
+  int use_activity_masking = 0;
+  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  od_coeff ref_int32[OD_BSIZE_MAX * OD_BSIZE_MAX];
+  od_coeff out_int32[OD_BSIZE_MAX * OD_BSIZE_MAX];
+  od_raster_to_coding_order(ref_coeff_pvq, blk_size, tx_type, ref_coeff,
+                            blk_size);
+  if (lossless)
+    pvq_dc_quant = 1;
+  else {
+    // TODO(yushin): Enable this for activity masking,
+    // when pvq_qm_q4 is available in AOM.
+    // pvq_dc_quant = OD_MAXI(1, quant*
+    // dec->state.pvq_qm_q4[pli][od_qm_get_index(bs, 0)] >> 4);
+    pvq_dc_quant = OD_MAXI(1, quant[0] >> quant_shift);
+  }
+  off = od_qm_offset(bs, xdec);
+  // copy int16 inputs to int32
+  for (i = 0; i < blk_size * blk_size; i++) ref_int32[i] = ref_coeff_pvq[i];
+  od_pvq_decode(dec, ref_int32, out_int32, (int)quant[1] >> quant_shift, pli,
+                bs, OD_PVQ_BETA[use_activity_masking][pli][bs],
+                OD_ROBUST_STREAM,
+                is_keyframe, &flags, ac_dc_coded, dec->state.qm + off,
+                dec->state.qm_inv + off);
+  // copy int32 result back to int16
+  for (i = 0; i < blk_size * blk_size; i++) dqcoeff_pvq[i] = out_int32[i];
+  if (!has_dc_skip || dqcoeff_pvq[0]) {  // has_dc_skip is 1: runs only if slot 0 is nonzero
+    dqcoeff_pvq[0] =
+        has_dc_skip + generic_decode(dec->ec, &dec->state.adapt.model_dc[pli],
+                                     -1, &dec->state.adapt.ex_dc[pli][bs][0], 2,
+                                     "dc:mag");
+    if (dqcoeff_pvq[0])
+      dqcoeff_pvq[0] *= od_ec_dec_bits(dec->ec, 1, "dc:sign") ? -1 : 1;
+  }
+  dqcoeff_pvq[0] = dqcoeff_pvq[0] * pvq_dc_quant + ref_coeff_pvq[0];  // scale DC, add reference DC
+  od_coding_order_to_raster(dqcoeff, blk_size, tx_type, dqcoeff_pvq, blk_size);
+  eob = blk_size * blk_size;
+  return eob;
+static int av1_pvq_decode_helper2(
+    MACROBLOCKD *const xd, MB_MODE_INFO *const mbmi, int plane, int row,
+    int col, TX_SIZE tx_size, TX_TYPE tx_type ) { /* Reads the AC/DC coded
+   flags for one transform block. When the flags are nonzero it copies the
+   current contents of dst into pred, forward-transforms pred into
+   pvq_ref_coeff, PVQ-decodes against that reference with
+   av1_pvq_decode_helper(), clears the dst block and calls
+   inverse_transform_block(). Returns the eob from av1_pvq_decode_helper(),
+   or 0 when the flags are zero. */
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  // transform block size in pixels
+  int tx_blk_size = tx_size_1d[tx_size];
+  int i, j;
+  tran_low_t *pvq_ref_coeff = pd->pvq_ref_coeff;
+  const int diff_stride = tx_blk_size;
+  int16_t *pred = pd->pred;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  int ac_dc_coded;  // bit0: DC coded, bit1 : AC coded
+  uint8_t *dst;
+  int eob;
+  eob = 0;
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];  // row/col are scaled by 4 here
+  // decode ac/dc coded flag. bit0: DC coded, bit1 : AC coded
+  // NOTE : we don't use 5 symbols for luma here in aom codebase,
+  // since block partition is taken care of by aom.
+  // So, only AC/DC skip info is coded
+  ac_dc_coded = od_decode_cdf_adapt(
+      xd->,  // NOTE(review): argument text is truncated here (presumably the entropy decoder) -- restore from upstream
+      xd->daala_dec.state.adapt.skip_cdf[2 * tx_size + (plane != 0)], 4,
+      xd->daala_dec.state.adapt.skip_increment, "skip");
+  if (ac_dc_coded) {
+    int xdec = pd->subsampling_x;
+    int seg_id = mbmi->segment_id;
+    int16_t *quant;
+    FWD_TXFM_PARAM fwd_txfm_param;
+    for (j = 0; j < tx_blk_size; j++)
+      for (i = 0; i < tx_blk_size; i++) {
+        pred[diff_stride * j + i] = dst[pd->dst.stride * j + i];
+      }
+    fwd_txfm_param.tx_type = tx_type;
+    fwd_txfm_param.tx_size = tx_size;
+    fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+    fwd_txfm_param.rd_transform = 0;
+    fwd_txfm_param.lossless = xd->lossless[seg_id];
+    fwd_txfm(pred, pvq_ref_coeff, diff_stride, &fwd_txfm_param);
+    quant = &pd->seg_dequant[seg_id][0];  // aom's quantizer
+    eob = av1_pvq_decode_helper(&xd->daala_dec, pvq_ref_coeff, dqcoeff, quant,
+                                plane, tx_size, tx_type, xdec, ac_dc_coded);
+    // Since av1 does not have separate inverse transform
+    // but also contains adding to predicted image,
+    // pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst as zeros
+    for (j = 0; j < tx_blk_size; j++)
+      for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
+    inverse_transform_block(xd, plane, tx_type, tx_size, dst,
+                            pd->dst.stride, eob);
+  }
+  return eob;
 static void predict_and_reconstruct_intra_block(
     AV1_COMMON *cm, MACROBLOCKD *const xd, aom_reader *r,
     MB_MODE_INFO *const mbmi, int plane, int row, int col, TX_SIZE tx_size) {
@@ -335,6 +480,10 @@
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   uint8_t *dst;
   int block_idx = (row << 1) + col;
+  (void)cm;
+  (void)r;
   dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
   if (mbmi->sb_type < BLOCK_8X8)
@@ -345,6 +494,7 @@
   if (!mbmi->skip) {
     TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
     const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type);
     const int eob = av1_decode_block_tokens(xd, plane, scan_order, col, row,
                                             tx_size, r, mbmi->segment_id);
@@ -353,6 +503,9 @@
     inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+    av1_pvq_decode_helper2(xd, mbmi, plane, row, col, tx_size, tx_type);
@@ -364,6 +517,13 @@
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   int block_idx = (row << 1) + col;
   TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
+  int eob;
+  (void)cm;
+  (void)r;
   const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type);
   const int eob = av1_decode_block_tokens(xd, plane, scan_order, col, row,
                                           tx_size, r, mbmi->segment_id);
@@ -373,6 +533,9 @@
   inverse_transform_block(xd, plane, tx_type, tx_size,
                           &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
                           pd->dst.stride, eob);
+  eob = av1_pvq_decode_helper2(xd, mbmi, plane, row, col, tx_size, tx_type);
   return eob;
@@ -623,6 +786,11 @@
   partition =
       read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols, n8x8_l2);
   subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  assert(partition < PARTITION_TYPES);
+  assert(subsize < BLOCK_SIZES);
   if (!hbs) {
     // calculate bmode block dimensions (log 2)
     xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
@@ -728,6 +896,7 @@
                        "Failed to allocate bool decoder %d", 1);
 static void read_coef_probs_common(av1_coeff_probs_model *coef_probs,
                                    aom_reader *r) {
   int i, j, k, l, m;
@@ -752,6 +921,7 @@
   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
     read_coef_probs_common(fc->coef_probs[tx_size], r);
 static void setup_segmentation(AV1_COMMON *const cm,
                                struct aom_read_bit_buffer *rb) {
@@ -1262,6 +1432,18 @@
+static void daala_dec_init(daala_dec_ctx *daala_dec, od_ec_dec *ec) {  // attach ec, reset adaptation state, build QM tables
+  daala_dec->ec = ec;
+  od_adapt_ctx_reset(&daala_dec->state.adapt, 0);
+  daala_dec->qm = OD_FLAT_QM;  // fixed to flat, so the HVS choice below is never selected
+  od_init_qm(daala_dec->state.qm, daala_dec->state.qm_inv,
+             daala_dec->qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
 static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
                                    const uint8_t *data_end) {
   AV1_COMMON *const cm = &pbi->common;
@@ -1331,6 +1513,9 @@
               ? &cm->counts
               : NULL;
+      av1_zero(tile_data->pvq_ref_coeff);
       av1_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
@@ -1342,7 +1527,14 @@
         tile_data->bit_reader.accounting = NULL;
-      av1_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+      av1_init_macroblockd(cm, &tile_data->xd,
+                           tile_data->pvq_ref_coeff,
+                           tile_data->dqcoeff);
+      daala_dec_init(&tile_data->xd.daala_dec, &tile_data->;
       tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
       tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
@@ -1598,7 +1790,14 @@
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
-      av1_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+      av1_init_macroblockd(cm, &tile_data->xd,
+                           tile_data->pvq_ref_coeff,
+                           tile_data->dqcoeff);
+      daala_dec_init(&tile_data->xd.daala_dec, &tile_data->;
       tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
       tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
@@ -2044,7 +2243,9 @@
   if (cm->tx_mode == TX_MODE_SELECT) read_tx_mode_probs(&fc->tx_probs, &r);
   read_coef_probs(fc, cm->tx_mode, &r);
   for (k = 0; k < SKIP_CONTEXTS; ++k)
     av1_diff_update_prob(&r, &fc->skip_probs[k], ACCT_STR);
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index fa2d061..1b12476 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -32,7 +32,10 @@
 #include "av1/decoder/decodeframe.h"
 #include "av1/decoder/decoder.h"
 #include "av1/decoder/detokenize.h"
 static void initialize_dec(void) {
   static volatile int init_done = 0;
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index d19909e..8d28a10 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -26,6 +26,12 @@
 #include "av1/common/accounting.h"
+#include "aom_dsp/entdec.h"
+#include "av1/decoder/decint.h"
+#include "av1/encoder/encodemb.h"
 #ifdef __cplusplus
 extern "C" {
@@ -37,6 +43,10 @@
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+  /* forward transformed predicted image, a reference for PVQ */
+  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_BSIZE_MAX * OD_BSIZE_MAX]);
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
 #endif  // CONFIG_PALETTE
@@ -49,6 +59,10 @@
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+  /* forward transformed predicted image, a reference for PVQ */
+  DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_BSIZE_MAX * OD_BSIZE_MAX]);
   DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
 #endif  // CONFIG_PALETTE
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 1de0048..68d87cb 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -9,9 +9,12 @@
  * PATENTS file, you can obtain it at
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
 #include "av1/common/common.h"
 #include "av1/common/entropy.h"
@@ -319,3 +322,4 @@
   av1_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
diff --git a/av1/decoder/detokenize.h b/av1/decoder/detokenize.h
index c2868d6..569580c 100644
--- a/av1/decoder/detokenize.h
+++ b/av1/decoder/detokenize.h
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at
@@ -31,5 +32,5 @@
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/decoder/generic_decoder.c b/av1/decoder/generic_decoder.c
new file mode 100644
index 0000000..86187fa
--- /dev/null
+++ b/av1/decoder/generic_decoder.c
@@ -0,0 +1,137 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include <stdio.h>
+#include "aom_dsp/entdec.h"
+#include "av1/common/generic_code.h"
+#include "av1/common/odintrin.h"
+#include "pvq_decoder.h"
+/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * On first use (*count == 0) the cdf is rescaled in place so that its last
+ * entry, cdf[n - 1], becomes 32768.
+ *
+ * @param [in,out] ec    range decoder
+ * @param [in,out] cdf   CDF of the variable (Q15)
+ * @param [in]     n     number of values possible
+ * @param [in,out] count number of symbols decoded with that cdf so far
+ * @param [in]     rate  adaptation rate shift (smaller is faster)
+ * @return decoded variable
+ */
+int od_decode_cdf_adapt_q15_(od_ec_dec *ec, uint16_t *cdf, int n,
+ int *count, int rate OD_ACC_STR) {
+  int val;
+  int i;
+  if (*count == 0) {
+    int ft;
+    ft = cdf[n - 1];
+    for (i = 0; i < n; i++) {
+      cdf[i] = cdf[i]*32768/ft;
+    }
+  }
+  val = od_ec_decode_cdf_q15(ec, cdf, n);
+  od_cdf_adapt_q15(val, cdf, n, count, rate);
+  return val;
+/** Decodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * After the symbol is read, every entry is halved (plus i + 1) when
+ * cdf[n - 1] + increment would exceed 32767; then every entry from the
+ * decoded value upward is raised by increment.
+ *
+ * @param [in,out] ec    range decoder
+ * @param [in,out] cdf   CDF of the variable (Q15), adapted in place
+ * @param [in]     n     number of values possible
+ * @param [in]     increment adaptation speed (Q15)
+ *
+ * @retval decoded variable
+ */
+int od_decode_cdf_adapt_(od_ec_dec *ec, uint16_t *cdf, int n,
+ int increment OD_ACC_STR) {
+  int i;
+  int val;
+  val = od_ec_decode_cdf_unscaled(ec, cdf, n);
+  if (cdf[n-1] + increment > 32767) {
+    for (i = 0; i < n; i++) {
+      /* Second term ensures that the pdf is non-null */
+      cdf[i] = (cdf[i] >> 1) + i + 1;
+    }
+  }
+  for (i = val; i < n; i++) cdf[i] += increment;
+  return val;
+/** Decodes a random variable using a "generic" model, assuming that the
+ * distribution is one-sided (zero and up), has a single mode, and decays
+ * exponentially past the model.
+ *
+ * @param [in,out] dec   range decoder
+ * @param [in,out] model generic probability model
+ * @param [in]     max   upper bound on x; 0 returns 0 without reading
+ * anything, -1 means no bound is applied
+ * @param [in,out] ex_q16 expectation of x (adapted)
+ * @param [in]     integration integration period of ex_q16 (leaky average over
+ * 1<<integration samples)
+ *
+ * @retval decoded variable x
+ */
+int generic_decode_(od_ec_dec *dec, generic_encoder *model, int max,
+ int *ex_q16, int integration OD_ACC_STR) {
+  int lg_q1;
+  int shift;
+  int id;
+  uint16_t *cdf;
+  int xs;
+  int lsb;
+  int x;
+  int ms;
+  lsb = 0;
+  if (max == 0) return 0;
+  lg_q1 = log_ex(*ex_q16);
+  /* If expectation is too large, shift x to ensure that
+     all we have past xs=15 is the exponentially decaying tail
+     of the distribution. */
+  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+  /* Choose the cdf to use: we have two per "octave" of ExQ16. */
+  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+  cdf = model->cdf[id];
+  ms = (max + (1 << shift >> 1)) >> shift;
+  if (max == -1) xs = od_ec_decode_cdf_unscaled(dec, cdf, 16);
+  else xs = od_ec_decode_cdf_unscaled(dec, cdf, OD_MINI(ms + 1, 16));
+  if (xs == 15) {
+    int e;
+    unsigned decay;
+    /* Estimate decay based on the assumption that the distribution is close
+       to Laplacian for large values. We should probably have an adaptive
+       estimate instead. Note: The 2* is a kludge that's not fully understood
+       yet. */
+    OD_ASSERT(*ex_q16 < INT_MAX >> 1);
+    e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
+    decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
+    xs += laplace_decode_special(dec, decay, (max == -1) ? -1 : ms - 15, acc_str);
+  }
+  if (shift != 0) {
+    int special;
+    /* Because of the rounding, there's only half the number of possibilities
+       for xs=0 */
+    special = xs == 0;
+    if (shift - special > 0) lsb = od_ec_dec_bits(dec, shift - special, acc_str);
+    lsb -= !special << (shift - 1);
+  }
+  x = (xs << shift) + lsb;
+  generic_model_update(model, ex_q16, x, xs, id, integration);
+   "dec: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, dec->rng)); /* NOTE(review): the opening line of this logging statement appears to be missing -- restore from upstream */
+  return x;
diff --git a/av1/decoder/laplace_decoder.c b/av1/decoder/laplace_decoder.c
new file mode 100644
index 0000000..4c3def5
--- /dev/null
+++ b/av1/decoder/laplace_decoder.c
@@ -0,0 +1,323 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include <stdio.h>
+#include "aom_dsp/entdec.h"
+#include "av1/common/pvq.h"
+#include "pvq_decoder.h"
+# define od_decode_pvq_split(ec, adapt, sum, ctx, str) od_decode_pvq_split_(ec, adapt, sum, ctx, str)
+# define od_decode_pvq_split(ec, adapt, sum, ctx, str) od_decode_pvq_split_(ec, adapt, sum, ctx)
+static int od_decode_pvq_split_(od_ec_dec *ec, od_pvq_codeword_ctx *adapt,
+ int sum, int ctx OD_ACC_STR) { /* Decodes a count in [0, sum]. The top
+   bits come from an adaptive cdf selected by ctx and (sum >> shift); the low
+   `shift` bits are read raw. A decoded count above sum is clamped to sum
+   and sets ec->error. Returns 0 without reading anything when sum is 0. */
+  int shift;
+  int count;
+  int msbs;
+  int fctx;
+  count = 0;
+  if (sum == 0) return 0;
+  shift = OD_MAXI(0, OD_ILOG(sum) - 3);
+  fctx = 7*ctx + (sum >> shift) - 1;
+  msbs = od_decode_cdf_adapt(ec, adapt->pvq_split_cdf[fctx],
+   (sum >> shift) + 1, adapt->pvq_split_increment, acc_str);
+  if (shift) count = od_ec_dec_bits(ec, shift, acc_str);
+  count += msbs << shift;
+  if (count > sum) {
+    count = sum;
+    ec->error = 1;
+  }
+  return count;
+void od_decode_band_pvq_splits(od_ec_dec *ec, od_pvq_codeword_ctx *adapt,
+ od_coeff *y, int n, int k, int level) { /* Recursively fills y[0..n-1]
+   with k pulses. n == 1 stores k directly; k == 0 clears the vector (via
+   OD_CLEAR); k == 1 with n <= 16 decodes the pulse position with one
+   symbol; otherwise the vector is halved, the number of pulses in the right
+   half is decoded, and both halves recurse with level + 1. No sign bits are
+   read here. */
+  int mid;
+  int count_right;
+  if (n == 1) {
+    y[0] = k;
+  }
+  else if (k == 0) {
+    OD_CLEAR(y, n);
+  }
+  else if (k == 1 && n <= 16) {
+    int cdf_id;
+    int pos;
+    cdf_id = od_pvq_k1_ctx(n, level == 0);
+    OD_CLEAR(y, n);
+    pos = od_decode_cdf_adapt(ec, adapt->pvq_k1_cdf[cdf_id], n,
+     adapt->pvq_k1_increment, "pvq:k1");
+    y[pos] = 1;
+  }
+  else {
+    mid = n >> 1;
+    count_right = od_decode_pvq_split(ec, adapt, k, od_pvq_size_ctx(n),
+     "pvq:split");
+    od_decode_band_pvq_splits(ec, adapt, y, mid, k - count_right, level + 1);
+    od_decode_band_pvq_splits(ec, adapt, y + mid, n - mid, count_right,
+     level + 1);
+  }
+/** Decodes the tail of a Laplace-distributed variable, i.e. it doesn't
+ * do anything special for the zero case.
+ *
+ * @param [in,out] dec range decoder
+ * @param [in] decay decay factor of the distribution, i.e. pdf ~= decay^x
+ * @param [in] max maximum possible value of x (used to truncate the pdf);
+ *  -1 means no bound, 0 returns 0 without reading anything
+ *
+ * @retval decoded variable x; when max is not -1, a value above max is
+ *  clamped to max and sets dec->error
+ */
+int od_laplace_decode_special_(od_ec_dec *dec, unsigned decay, int max OD_ACC_STR) {
+  int pos;
+  int shift;
+  int xs;
+  int ms;
+  int sym;
+  const uint16_t *cdf;
+  shift = 0;
+  if (max == 0) return 0;
+  /* We don't want a large decay value because that would require too many
+     symbols. However, it's OK if the max is below 15. */
+  while (((max >> shift) >= 15 || max == -1) && decay > 235) {
+    decay = (decay*decay + 128) >> 8;
+    shift++;
+  }
+  decay = OD_MINI(decay, 254);
+  decay = OD_MAXI(decay, 2);
+  ms = max >> shift;
+  cdf = EXP_CDF_TABLE[(decay + 1) >> 1];
+  OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d\n", decay));
+  xs = 0;
+  do {
+    sym = OD_MINI(xs, 15);  /* only feeds the log below; overwritten by the decode */
+    {
+      int i;
+      OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d", xs, shift, sym, max));
+      for (i = 0; i < 16; i++) {
+        OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
+      }
+    }
+    if (ms > 0 && ms < 15) {
+      /* Simple way of truncating the pdf when we have a bound. */
+      sym = od_ec_decode_cdf_unscaled(dec, cdf, ms + 1);
+    }
+    else sym = od_ec_decode_cdf_q15(dec, cdf, 16);
+    xs += sym;
+    ms -= 15;
+  }
+  while (sym >= 15 && ms != 0);
+  if (shift) pos = (xs << shift) + od_ec_dec_bits(dec, shift, acc_str);
+  else pos = xs;
+  OD_ASSERT(pos >> shift <= max >> shift || max == -1);
+  if (max != -1 && pos > max) {
+    pos = max;
+    dec->error = 1;
+  }
+  OD_ASSERT(pos <= max || max == -1);
+  return pos;
+/** Decodes a Laplace-distributed variable for use in PVQ.
+ *
+ * @param [in,out] dec   range decoder
+ * @param [in]     ex_q8 expectation of the absolute value of x
+ * @param [in]     k     maximum value of |x|
+ *
+ * @retval decoded variable; this function itself reads no sign bit
+ */
+int od_laplace_decode_(od_ec_dec *dec, unsigned ex_q8, int k OD_ACC_STR) {
+  int j;
+  int shift;
+  uint16_t cdf[16];
+  int sym;
+  int lsb;
+  int decay;
+  int offset;
+  lsb = 0;
+  /* Shift down x if expectation is too high. */
+  shift = OD_ILOG(ex_q8) - 11;
+  if (shift < 0) shift = 0;
+  /* Apply the shift with rounding to Ex, K and xs. */
+  ex_q8 = (ex_q8 + (1 << shift >> 1)) >> shift;
+  k = (k + (1 << shift >> 1)) >> shift;
+  decay = OD_MINI(254, OD_DIVU(256*ex_q8, (ex_q8 + 256)));
+  offset = LAPLACE_OFFSET[(decay + 1) >> 1];
+  for (j = 0; j < 16; j++) {
+    cdf[j] = EXP_CDF_TABLE[(decay + 1) >> 1][j] - offset;
+  }
+  /* Simple way of truncating the pdf when we have a bound */
+  if (k == 0) sym = 0;
+  else sym = od_ec_decode_cdf_unscaled(dec, cdf, OD_MINI(k + 1, 16));
+  if (shift) {
+    int special;
+    /* Because of the rounding, there's only half the number of possibilities
+       for xs=0 */
+    special = (sym == 0);
+    if (shift - special > 0) lsb = od_ec_dec_bits(dec, shift - special, acc_str);
+    lsb -= (!special << (shift - 1));
+  }
+  /* Handle the exponentially-decaying tail of the distribution */
+  if (sym == 15) sym += laplace_decode_special(dec, decay, k - 15, acc_str);
+  return (sym << shift) + lsb;
+# define laplace_decode_vector_delta(dec, y, n, k, curr, means, str) laplace_decode_vector_delta_(dec, y, n, k, curr, means, str)
+# define laplace_decode_vector_delta(dec, y, n, k, curr, means, str) laplace_decode_vector_delta_(dec, y, n, k, curr, means)
+static void laplace_decode_vector_delta_(od_ec_dec *dec, od_coeff *y, int n, int k,
+                                        int32_t *curr, const int32_t *means
+                                        OD_ACC_STR) { /* Decodes a vector
+   of n entries holding k unit pulses, coded as position deltas. y is zeroed
+   first. The first delta is read with laplace_decode_special() bounded by
+   n - 1, later ones with laplace_decode(). A sign bit is read only when
+   the target entry is still zero; otherwise the previous sign is reused.
+   The adaptation statistics are written to curr. */
+  int i;
+  int prev;
+  int sum_ex;
+  int sum_c;
+  int coef;
+  int pos;
+  int k0;
+  int sign;
+  int first;
+  int k_left;
+  prev = 0;
+  sum_ex = 0;
+  sum_c = 0;
+  coef = 256*means[OD_ADAPT_COUNT_Q8]/
+   (1 + means[OD_ADAPT_COUNT_EX_Q8]);
+  pos = 0;
+  sign = 0;
+  first = 1;
+  k_left = k;
+  for (i = 0; i < n; i++) y[i] = 0;
+  k0 = k_left;
+  coef = OD_MAXI(coef, 1);
+  for (i = 0; i < k0; i++) {
+    int count;
+    if (first) {
+      int decay;
+      int ex = coef*(n - prev)/k_left;
+      if (ex > 65280) decay = 255;
+      else {
+        decay = OD_MINI(255,
+         (int)((256*ex/(ex + 256) + (ex>>5)*ex/((n + 1)*(n - 1)*(n - 1)))));
+      }
+      /*Update mean position.*/
+      count = laplace_decode_special(dec, decay, n - 1, acc_str);
+      first = 0;
+    }
+    else count = laplace_decode(dec, coef*(n - prev)/k_left, n - prev - 1, acc_str);
+    sum_ex += 256*(n - prev);
+    sum_c += count*k_left;
+    pos += count;
+    OD_ASSERT(pos < n);
+    if (y[pos] == 0)
+      sign = od_ec_dec_bits(dec, 1, acc_str);
+    y[pos] += sign ? -1 : 1;
+    prev = pos;
+    k_left--;
+    if (k_left == 0) break;
+  }
+  if (k > 0) {
+    curr[OD_ADAPT_COUNT_Q8] = 256*sum_c;
+    curr[OD_ADAPT_COUNT_EX_Q8] = sum_ex;
+  }
+  else {
+    curr[OD_ADAPT_COUNT_Q8] = -1;
+    curr[OD_ADAPT_COUNT_EX_Q8] = 0;
+  }
+  curr[OD_ADAPT_K_Q8] = 0;
+  curr[OD_ADAPT_SUM_EX_Q8] = 0;
+/** Decodes a vector of integers assumed to come from rounding a sequence of
+ * Laplace-distributed real values in decreasing order of variance.
+ *
+ * When k <= 1 the whole vector is decoded by laplace_decode_vector_delta();
+ * the same routine also decodes the remainder once a single pulse is left
+ * before the last position.
+ *
+ * @param [in,out] dec   range decoder
+ * @param [out]    y     decoded vector
+ * @param [in]     n     dimension of the vector
+ * @param [in]     k     sum of the absolute value of components of y
+ * @param [out]    curr  Adaptation context output, may alias means.
+ * @param [in]     means Adaptation context input.
+ */
+void od_laplace_decode_vector_(od_ec_dec *dec, od_coeff *y, int n, int k,
+                           int32_t *curr, const int32_t *means OD_ACC_STR) {
+  int i;
+  int sum_ex;
+  int kn;
+  int exp_q8;
+  int mean_k_q8;
+  int mean_sum_ex_q8;
+  int ran_delta;
+  ran_delta = 0;
+  if (k <= 1) {
+    laplace_decode_vector_delta(dec, y, n, k, curr, means, acc_str);
+    return;
+  }
+  if (k == 0) {  /* NOTE(review): unreachable -- k <= 1 already returned above */
+    curr[OD_ADAPT_K_Q8] = 0;
+    curr[OD_ADAPT_SUM_EX_Q8] = 0;
+    for (i = 0; i < n; i++) y[i] = 0;
+    return;
+  }
+  sum_ex = 0;
+  kn = k;
+  /* Estimates the factor relating pulses_left and positions_left to E(|x|).*/
+  mean_k_q8 = means[OD_ADAPT_K_Q8];
+  mean_sum_ex_q8 = means[OD_ADAPT_SUM_EX_Q8];
+  if (mean_k_q8 < 1 << 23) exp_q8 = 256*mean_k_q8/(1 + mean_sum_ex_q8);
+  else exp_q8 = mean_k_q8/(1 + (mean_sum_ex_q8 >> 8));
+  for (i = 0; i < n; i++) {
+    int ex;
+    int x;
+    if (kn == 0) break;
+    if (kn <= 1 && i != n - 1) {
+      laplace_decode_vector_delta(dec, y + i, n - i, kn, curr, means, acc_str);
+      ran_delta = 1;
+      i = n;
+      break;
+    }
+    /* Expected value of x (round-to-nearest) is
+       expQ8*pulses_left/positions_left. */
+    ex = (2*exp_q8*kn + (n - i))/(2*(n - i));
+    if (ex > kn*256) ex = kn*256;
+    sum_ex += (2*256*kn + (n - i))/(2*(n - i));
+    /* No need to encode the magnitude for the last bin. */
+    if (i != n - 1) x = laplace_decode(dec, ex, kn, acc_str);
+    else x = kn;
+    if (x != 0) {
+      if (od_ec_dec_bits(dec, 1, acc_str)) x = -x;
+    }
+    y[i] = x;
+    kn -= abs(x);
+  }
+  /* Adapting the estimates for expQ8. */
+  if (!ran_delta) {  /* NOTE(review): body is empty in this text -- check against upstream */
+  }
+  curr[OD_ADAPT_K_Q8] = k - kn;
+  curr[OD_ADAPT_SUM_EX_Q8] = sum_ex;
+  for (; i < n; i++) y[i] = 0;
diff --git a/av1/decoder/pvq_decoder.c b/av1/decoder/pvq_decoder.c
new file mode 100644
index 0000000..2340605
--- /dev/null
+++ b/av1/decoder/pvq_decoder.c
@@ -0,0 +1,371 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "./aom_config.h"
+#include "aom_dsp/entcode.h"
+#include "aom_dsp/entdec.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/partition.h"
+#include "av1/common/pvq_state.h"
+#include "av1/decoder/decint.h"
+#include "av1/decoder/pvq_decoder.h"
+static void od_decode_pvq_codeword(od_ec_dec *ec, od_pvq_codeword_ctx *ctx,
+ od_coeff *y, int n, int k) {
+  int i;
+  od_decode_band_pvq_splits(ec, ctx, y, n, k, 0);
+  for (i = 0; i < n; i++) {
+    if (y[i] && od_ec_dec_bits(ec, 1, "pvq:sign")) y[i] = -y[i];
+  }
+/** Inverse of neg_interleave; decodes the interleaved gain.
+ *
+ * @param [in]      x      quantized/interleaved gain to decode
+ * @param [in]      ref    quantized gain of the reference
+ * @return                 original quantized gain value
+ */
+static int neg_deinterleave(int x, int ref) {
+  if (x < 2*ref-1) {
+    if (x & 1) return ref - 1 - (x >> 1);
+    else return ref + (x >> 1);
+  }
+  else return x+1;
+/** Synthesizes one partition of coefficient values from a PVQ-encoded
+ * vector.
+ *
+ * @param [out]     xcoeff  output coefficient partition (x in math doc)
+ * @param [in]      ypulse  PVQ-encoded values (y in math doc); in the noref
+ *                          case, this vector has n entries, in the
+ *                          reference case it contains n-1 entries
+ *                          (the m-th entry is not included)
+ * @param [in]      r16     reference vector (prediction)
+ * @param [in]      n       number of elements in this partition
+ * @param [in]      gr      gain of the reference vector (prediction)
+ * @param [in]      noref   indicates presence or lack of prediction
+ * @param [in]      g       decoded quantized vector gain
+ * @param [in]      theta   decoded theta (prediction error)
+ * @param [in]      qm_inv  Inverse of QM with magnitude compensation
+ * @param [in]      shift   shift forwarded to od_compute_householder() when a reference is used
+ */
+static void pvq_synthesis(od_coeff *xcoeff, od_coeff *ypulse, od_val16 *r16,
+ int n, od_val32 gr, int noref, od_val32 g, od_val32 theta, const int16_t *qm_inv,
+ int shift) {
+  int s;
+  int m;
+  /* Sign of the Householder reflection vector */
+  s = 0;
+  /* Direction of the Householder reflection vector */
+  m = noref ? 0 : od_compute_householder(r16, n, gr, &s, shift);
+  od_pvq_synthesis_partial(xcoeff, ypulse, r16, n, noref, g, theta, m, s,
+   qm_inv);
+typedef struct {
+  od_coeff *ref;
+  int nb_coeffs;
+  int allow_flip;
+} cfl_ctx;
+/** Decodes a single vector of integers (eg, a partition within a
+ *  coefficient block) encoded using PVQ
+ *
+ * @param [in,out] ec      range decoder
+ * @param [in]     q0      scale/quantizer
+ * @param [in]     n       number of coefficients in partition
+ * @param [in,out] model   entropy decoder state
+ * @param [in,out] adapt   adaptation context
+ * @param [in,out] exg     ExQ16 expectation of decoded gain value
+ * @param [in,out] ext     ExQ16 expectation of decoded theta value
+ * @param [in]     ref     'reference' (prediction) vector
+ * @param [out]    out     decoded partition
+ * @param [out]    noref   boolean indicating absence of reference
+ * @param [in]     beta    per-band activity masking beta param
+ * @param [in]     robust  stream is robust to error in the reference
+ * @param [in]     is_keyframe whether we're decoding a keyframe
+ * @param [in]     pli     plane index
+ * @param [in]     cdf_ctx selects which cdf context to use
+ * @param [in,out] skip_rest whether to skip further bands in each direction
+ * @param [in]     band    index of the band being decoded
+ * @param [out]    skip    skip flag with range [0,1]
+ * @param [in]     qm      QM with magnitude compensation
+ * @param [in]     qm_inv  Inverse of QM with magnitude compensation
+ */
+static void pvq_decode_partition(od_ec_dec *ec,
+                                 int q0,
+                                 int n,
+                                 generic_encoder model[3],
+                                 od_adapt_ctx *adapt,
+                                 int *exg,
+                                 int *ext,
+                                 od_coeff *ref,
+                                 od_coeff *out,
+                                 int *noref,
+                                 od_val16 beta,
+                                 int robust,
+                                 int is_keyframe,
+                                 int pli,
+                                 int cdf_ctx,
+                                 cfl_ctx *cfl,
+                                 int has_skip,
+                                 int *skip_rest,
+                                 int band,
+                                 int *skip,
+                                 const int16_t *qm,
+                                 const int16_t *qm_inv) {
+  int k;
+  od_val32 qcg;
+  int max_theta;
+  int itheta;
+  od_val32 theta;
+  od_val32 gr;
+  od_val32 gain_offset;
+  od_coeff y[MAXN];
+  int qg;
+  int nodesync;
+  int id;
+  int i;
+  od_val16 ref16[MAXN];
+  int rshift;
+  theta = 0;
+  gr = 0;
+  gain_offset = 0;
+  /* We always use the robust bitstream for keyframes to avoid having
+     PVQ and entropy decoding depending on each other, hurting parallelism. */
+  nodesync = robust || is_keyframe;
+  /* Skip is per-direction. For band=0, we can use any of the flags. */
+  if (skip_rest[(band + 2) % 3]) {
+    qg = 0;
+    if (is_keyframe) {
+      itheta = -1;
+      *noref = 1;
+    }
+    else {
+      itheta = 0;
+      *noref = 0;
+    }
+  }
+  else {
+    /* Jointly decode gain, itheta and noref for small values. Then we handle
+       larger gain. We need to wait for itheta because in the !nodesync case
+       it depends on max_theta, which depends on the gain. */
+    id = od_decode_cdf_adapt(ec, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
+     8 + 7*has_skip, adapt->pvq.pvq_gaintheta_increment,
+     "pvq:gaintheta");
+    if (!is_keyframe && id >= 10) id++;
+    if (is_keyframe && id >= 8) id++;
+    if (id >= 8) {
+      id -= 8;
+      skip_rest[0] = skip_rest[1] = skip_rest[2] = 1;
+    }
+    qg = id & 1;
+    itheta = (id >> 1) - 1;
+    *noref = (itheta == -1);
+  }
+  /* The CfL flip bit is only decoded on the first band that has noref=0. */
+  if (cfl->allow_flip && !*noref) {
+    int flip;
+    flip = od_ec_dec_bits(ec, 1, "cfl:flip");
+    if (flip) {
+      for (i = 0; i < cfl->nb_coeffs; i++) cfl->ref[i] = -cfl->ref[i];
+    }
+    cfl->allow_flip = 0;
+  }
+  if (qg > 0) {
+    int tmp;
+    tmp = *exg;
+    qg = 1 + generic_decode(ec, &model[!*noref], -1, &tmp, 2, "pvq:gain");
+    OD_IIR_DIADIC(*exg, qg << 16, 2);
+  }
+  *skip = 0;
+#if defined(OD_FLOAT_PVQ)
+  rshift = 0;
+  /* Shift needed to make the reference fit in 15 bits, so that the Householder
+     vector can fit in 16 bits. */
+  rshift = OD_MAXI(0, od_vector_log_mag(ref, n) - 14);
+  for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+    ref16[i] = ref[i]*(double)qm[i]*OD_QM_SCALE_1;
+    ref16[i] = OD_SHR_ROUND(ref[i]*qm[i], OD_QM_SHIFT + rshift);
+  }
+  if(!*noref){
+    /* we have a reference; compute its gain */
+    od_val32 cgr;
+    int icgr;
+    int cfl_enabled;
+    cfl_enabled = pli != 0 && is_keyframe && !OD_DISABLE_CFL;
+    cgr = od_pvq_compute_gain(ref16, n, q0, &gr, beta, rshift);
+    if (cfl_enabled) cgr = OD_CGAIN_SCALE;
+#if defined(OD_FLOAT_PVQ)
+    icgr = (int)floor(.5 + cgr);
+    icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
+    /* quantized gain is interleave encoded when there's a reference;
+       deinterleave it now */
+    if (is_keyframe) qg = neg_deinterleave(qg, icgr);
+    else {
+      qg = neg_deinterleave(qg, icgr + 1) - 1;
+      if (qg == 0) *skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
+    }
+    if (qg == icgr && itheta == 0 && !cfl_enabled) *skip = OD_PVQ_SKIP_COPY;
+    gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
+    qcg = OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset;
+    /* read and decode first-stage PVQ error theta */
+    max_theta = od_pvq_compute_max_theta(qcg, beta);
+    if (itheta > 1 && (nodesync || max_theta > 3)) {
+      int tmp;
+      tmp = *ext;
+      itheta = 2 + generic_decode(ec, &model[2], nodesync ? -1 : max_theta - 3,
+       &tmp, 2, "pvq:theta");
+      OD_IIR_DIADIC(*ext, itheta << 16, 2);
+    }
+    theta = od_pvq_compute_theta(itheta, max_theta);
+  }
+  else{
+    itheta = 0;
+    if (!is_keyframe) qg++;
+    qcg = OD_SHL(qg, OD_CGAIN_SHIFT);
+    if (qg == 0) *skip = OD_PVQ_SKIP_ZERO;
+  }
+  k = od_pvq_compute_k(qcg, itheta, theta, *noref, n, beta, nodesync);
+  if (k != 0) {
+    /* when noref==0, y is actually size n-1 */
+    od_decode_pvq_codeword(ec, &adapt->pvq.pvq_codeword_ctx, y, n - !*noref,
+     k);
+  }
+  else {
+    OD_CLEAR(y, n);
+  }
+  if (*skip) {
+    if (*skip == OD_PVQ_SKIP_COPY) OD_COPY(out, ref, n);
+    else OD_CLEAR(out, n);
+  }
+  else {
+    od_val32 g;
+    g = od_gain_expand(qcg, q0, beta);
+    pvq_synthesis(out, y, ref16, n, gr, *noref, g, theta, qm_inv, rshift);
+  }
+  *skip = !!*skip;
+/** Decodes a coefficient block (except for DC) encoded using PVQ
+ *
+ * @param [in,out] dec     daala decoder context
+ * @param [in]     ref     'reference' (prediction) vector
+ * @param [out]    out     decoded partition
+ * @param [in]     q0      quantizer
+ * @param [in]     pli     plane index
+ * @param [in]     bs      log of the block size minus two
+ * @param [in]     beta    per-band activity masking beta param
+ * @param [in]     robust  stream is robust to error in the reference
+ * @param [in]     is_keyframe whether we're decoding a keyframe
+ * @param [out]    flags   bitmask of the per band skip and noref flags
+ * @param [in]     block_skip skip flag for the block (range 0-3)
+ * @param [in]     qm      QM with magnitude compensation
+ * @param [in]     qm_inv  Inverse of QM with magnitude compensation
+ */
+void od_pvq_decode(daala_dec_ctx *dec,
+                   od_coeff *ref,
+                   od_coeff *out,
+                   int q0,
+                   int pli,
+                   int bs,
+                   const od_val16 *beta,
+                   int robust,
+                   int is_keyframe,
+                   unsigned int *flags,
+                   int block_skip,
+                   const int16_t *qm,
+                   const int16_t *qm_inv){
+  int noref[PVQ_MAX_PARTITIONS];
+  int skip[PVQ_MAX_PARTITIONS];
+  int *exg;
+  int *ext;
+  int nb_bands;
+  int i;
+  const int *off;
+  int size[PVQ_MAX_PARTITIONS];
+  generic_encoder *model;
+  int skip_rest[3] = {0};
+  cfl_ctx cfl;
+  /* const unsigned char *pvq_qm; */
+  /*Default to skip=1 and noref=0 for all bands.*/
+  for (i = 0; i < PVQ_MAX_PARTITIONS; i++) {
+    noref[i] = 0;
+    skip[i] = 1;
+  }
+  /* TODO(yushin): Enable this for activity masking,
+     when pvq_qm_q4 is available in AOM. */
+  /*pvq_qm = &dec->state.pvq_qm_q4[pli][0];*/
+  exg = &dec->state.adapt.pvq.pvq_exg[pli][bs][0];
+  ext = dec->state.adapt.pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
+  model = dec->state.adapt.pvq.pvq_param_model;
+  nb_bands = OD_BAND_OFFSETS[bs][0];
+  off = &OD_BAND_OFFSETS[bs][1];
+  OD_ASSERT(block_skip < 4);
+  out[0] = block_skip & 1;
+  if (!(block_skip >> 1)) {
+    if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
+    else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
+  }
+  else {
+    for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
+    cfl.ref = ref;
+    cfl.nb_coeffs = off[nb_bands];
+    cfl.allow_flip = pli != 0 && is_keyframe;
+    for (i = 0; i < nb_bands; i++) {
+      int q;
+      /* TODO(yushin): Enable this for activity masking,
+         when pvq_qm_q4 is available in AOM. */
+      /*q = OD_MAXI(1, q0*pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);*/
+      q = OD_MAXI(1, q0);
+      pvq_decode_partition(dec->ec, q, size[i],
+       model, &dec->state.adapt, exg + i, ext + i, ref + off[i], out + off[i],
+       &noref[i], beta[i], robust, is_keyframe, pli,
+       &cfl, i == 0 && (i < nb_bands - 1), skip_rest, i, &skip[i],
+       qm + off[i], qm_inv + off[i]);
+      if (i == 0 && !skip_rest[0] && bs > 0) {
+        int skip_dir;
+        int j;
+        skip_dir = od_decode_cdf_adapt(dec->ec,
+         &dec->state.adapt.pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7,
+         dec->state.adapt.pvq.pvq_skip_dir_increment, "pvq:skiprest");
+        for (j = 0; j < 3; j++) skip_rest[j] = !!(skip_dir & (1 << j));
+      }
+    }
+  }
+  *flags = 0;
+  for (i = nb_bands - 1; i >= 0; i--) {
+    *flags <<= 1;
+    *flags |= noref[i]&1;
+    *flags <<= 1;
+    *flags |= skip[i]&1;
+  }
diff --git a/av1/decoder/pvq_decoder.h b/av1/decoder/pvq_decoder.h
new file mode 100644
index 0000000..d749040
--- /dev/null
+++ b/av1/decoder/pvq_decoder.h
@@ -0,0 +1,45 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_pvq_decoder_H)
+# define _pvq_decoder_H (1)
+# include "aom_dsp/entdec.h"
+# include "av1/common/pvq.h"
+# include "av1/decoder/decint.h"
+void od_decode_band_pvq_splits(od_ec_dec *ec, od_pvq_codeword_ctx *adapt,
+ od_coeff *y, int n, int k, int level);
+# define laplace_decode_special(dec, decay, max, str) od_laplace_decode_special_(dec, decay, max, str)
+# define laplace_decode(dec, ex_q8, k, str) od_laplace_decode_(dec, ex_q8, k, str)
+#define laplace_decode_vector(dec, y, n, k, curr, means, str) od_laplace_decode_vector_(dec, y, n, k, curr, means, str)
+# define laplace_decode_special(dec, decay, max, str) od_laplace_decode_special_(dec, decay, max)
+# define laplace_decode(dec, ex_q8, k, str) od_laplace_decode_(dec, ex_q8, k)
+#define laplace_decode_vector(dec, y, n, k, curr, means, str) od_laplace_decode_vector_(dec, y, n, k, curr, means)
+int od_laplace_decode_special_(od_ec_dec *dec, unsigned decay, int max OD_ACC_STR);
+int od_laplace_decode_(od_ec_dec *dec, unsigned ex_q8, int k OD_ACC_STR);
+void od_laplace_decode_vector_(od_ec_dec *dec, od_coeff *y, int n, int k,
+                                  int32_t *curr, const int32_t *means
+                                  OD_ACC_STR);
+void od_pvq_decode(daala_dec_ctx *dec, od_coeff *ref, od_coeff *out, int q0,
+ int pli, int bs, const od_val16 *beta, int robust, int is_keyframe,
+ unsigned int *flags, int block_skip, const int16_t *qm,
+ const int16_t *qm_inv);
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 156c4f4..95e86c6 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -46,6 +46,9 @@
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/subexp.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/pvq_encoder.h"
 static struct av1_token intra_mode_encodings[INTRA_MODES];
 static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS];
@@ -451,6 +454,7 @@
 #endif  // CONFIG_PALETTE
 static void pack_mb_tokens(aom_writer *w, TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop,
                            aom_bit_depth_t bit_depth, const TX_SIZE tx) {
@@ -551,6 +555,7 @@
   *tp = p;
 static void write_segment_id(aom_writer *w, const struct segmentation *seg,
                              struct segmentation_probs *segp, int segment_id) {
@@ -1054,6 +1059,20 @@
+PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) {
+  PVQ_INFO *pvq;
+  assert(pvq_q->curr_pos <= pvq_q->last_pos);
+  assert(pvq_q->curr_pos < pvq_q->buf_len);
+  pvq = pvq_q->buf + pvq_q->curr_pos;
+  ++pvq_q->curr_pos;
+  return pvq;
 static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
                           aom_writer *w, TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end, int mi_row,
@@ -1062,7 +1081,14 @@
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
   int plane;
+  MB_MODE_INFO *mbmi;
+  BLOCK_SIZE bsize;
+  od_adapt_ctx *adapt;
+  (void)tok;
+  (void)tok_end;
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   m = xd->mi[0];
@@ -1071,6 +1097,12 @@
   set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
                  mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
                  cm->mi_rows, cm->mi_cols);
+  mbmi = &m->mbmi;
+  bsize = mbmi->sb_type;
+  adapt = &cpi->td.mb.daala_enc.state.adapt;
   if (frame_is_intra_only(cm)) {
     write_mb_modes_kf(cm, xd, xd->mi, w);
   } else {
@@ -1092,6 +1124,7 @@
 #endif  // CONFIG_PALETTE
   if (!m->mbmi.skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
@@ -1102,6 +1135,104 @@
+  // PVQ writes its tokens (i.e. symbols) here.
+  if (!m->mbmi.skip) {
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      PVQ_INFO *pvq;
+      TX_SIZE tx_size =
+          plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane]) : m->mbmi.tx_size;
+      int idx, idy;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      int num_4x4_w;
+      int num_4x4_h;
+      int max_blocks_wide;
+      int max_blocks_high;
+      int step = (1 << tx_size);
+      const int step_xy = 1 << (tx_size << 1);
+      int block = 0;
+      if (tx_size == TX_4X4 && bsize <= BLOCK_8X8) {
+        num_4x4_w = 2 >> xd->plane[plane].subsampling_x;
+        num_4x4_h = 2 >> xd->plane[plane].subsampling_y;
+      } else {
+        num_4x4_w =
+            num_4x4_blocks_wide_lookup[bsize] >> xd->plane[plane].subsampling_x;
+        num_4x4_h =
+            num_4x4_blocks_high_lookup[bsize] >> xd->plane[plane].subsampling_y;
+      }
+      // TODO: Do we need below for 4x4,4x8,8x4 cases as well?
+      max_blocks_wide =
+          num_4x4_w + (xd->mb_to_right_edge >= 0
+                           ? 0
+                           : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      max_blocks_high =
+          num_4x4_h + (xd->mb_to_bottom_edge >= 0
+                           ? 0
+                           : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+      // TODO(yushin) Try to use av1_foreach_transformed_block_in_plane().
+      // Logic like the mb_to_right_edge/mb_to_bottom_edge stuff should
+      // really be centralized in one place.
+      for (idy = 0; idy < max_blocks_high; idy += step) {
+        for (idx = 0; idx < max_blocks_wide; idx += step) {
+          const int is_keyframe = 0;
+          const int encode_flip = 0;
+          const int flip = 0;
+          const int robust = 1;
+          int i;
+          const int has_dc_skip = 1;
+          int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0];
+          int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS;
+          generic_encoder *model = adapt->pvq.pvq_param_model;
+          pvq = get_pvq_block(cpi->td.mb.pvq_q);
+          // encode block skip info
+          od_encode_cdf_adapt(&w->ec, pvq->ac_dc_coded,
+                              adapt->skip_cdf[2 * tx_size + (plane != 0)], 4,
+                              adapt->skip_increment);
+          // AC coeffs coded?
+          if (pvq->ac_dc_coded & 0x02) {
+            assert(pvq->bs <= tx_size);
+            for (i = 0; i < pvq->nb_bands; i++) {
+              if (i == 0 || (!pvq->skip_rest &&
+                             !(pvq->skip_dir & (1 << ((i - 1) % 3))))) {
+                pvq_encode_partition(
+                    &w->ec, pvq->qg[i], pvq->theta[i], pvq->max_theta[i],
+                    pvq->y + pvq->off[i], pvq->size[i], pvq->k[i], model, adapt,
+                    exg + i, ext + i, robust || is_keyframe,
+                    (plane != 0) * OD_NBSIZES * PVQ_MAX_PARTITIONS +
+                        pvq->bs * PVQ_MAX_PARTITIONS + i,
+                    is_keyframe, i == 0 && (i < pvq->nb_bands - 1),
+                    pvq->skip_rest, encode_flip, flip);
+              }
+              if (i == 0 && !pvq->skip_rest && pvq->bs > 0) {
+                od_encode_cdf_adapt(
+                    &w->ec, pvq->skip_dir,
+                    &adapt->pvq
+                         .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0],
+                    7, adapt->pvq.pvq_skip_dir_increment);
+              }
+            }
+          }
+          // Encode residue of DC coeff, if exist.
+          if (!has_dc_skip || (pvq->ac_dc_coded & 1)) {  // DC coded?
+            generic_encode(&w->ec, &adapt->model_dc[plane],
+                           abs(pvq->dq_dc_residue) - has_dc_skip, -1,
+                           &adapt->ex_dc[plane][pvq->bs][0], 2);
+          }
+          if ((pvq->ac_dc_coded & 1)) {  // DC coded?
+            od_ec_enc_bits(&w->ec, pvq->dq_dc_residue < 0, 1);
+          }
+          block += step_xy;
+        }
+      }  // for (idy = 0;
+    }    // for (plane =
+  }      // if (!m->mbmi.skip)
 static void write_partition(const AV1_COMMON *const cm,
@@ -1233,6 +1364,9 @@
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int mi_row, mi_col;
+  assert(cpi->td.mb.pvq_q->curr_pos == 0);
   if (cpi->common.delta_q_present_flag) {
     xd->prev_qindex = cpi->common.base_qindex;
@@ -1246,8 +1380,16 @@
          mi_col += MAX_MIB_SIZE)
       write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
+  // Check that the number of PVQ blocks encoded matches the number written
+  // to the bitstream
+  assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos);
+  // Reset curr_pos in case we repack the bitstream
+  cpi->td.mb.pvq_q->curr_pos = 0;
 static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size,
                                     av1_coeff_stats *coef_branch_ct,
                                     av1_coeff_probs_model *coef_probs) {
@@ -1441,6 +1583,7 @@
 static void encode_loopfilter(struct loopfilter *lf,
                               struct aom_write_bit_buffer *wb) {
@@ -1796,6 +1939,9 @@
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       const int tile_idx = tile_row * tile_cols + tile_col;
       unsigned int tile_size;
+      TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
       TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
       const int is_last_tile = tile_idx == tile_rows * tile_cols - 1;
@@ -1837,12 +1983,20 @@
       aom_start_encode(&residual_bc, data_ptr + total_size + 4 * !is_last_tile);
+      // NOTE: This will not work with CONFIG_ANS turned on.
+      od_adapt_ctx_reset(&cpi->td.mb.daala_enc.state.adapt, 0);
+      cpi->td.mb.pvq_q = &this_tile->pvq_q;
       write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &residual_bc, &tok,
       assert(tok == tok_end);
       tile_size = residual_bc.pos - 1;
+      cpi->td.mb.pvq_q = NULL;
       assert(tile_size > 0);
       if (!is_last_tile) {
         // size of this tile
@@ -2165,7 +2319,9 @@
   update_txfm_probs(cm, header_bc, counts);
   update_coef_probs(cpi, header_bc);
   update_skip_probs(cm, header_bc, counts);
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 7e319c4..211ae58 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -14,6 +14,9 @@
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
+#include "av1/encoder/encint.h"
 #include "av1/common/mvref_common.h"
@@ -22,6 +25,12 @@
 extern "C" {
+// Maximum possible # of tx blocks in luma plane, which is currently 256,
+// since there can be 16x16 of 4x4 tx.
 typedef struct {
   unsigned int sse;
   int sum;
@@ -30,6 +39,9 @@
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+  DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]);
   tran_low_t *qcoeff;
   tran_low_t *coeff;
   uint16_t *eobs;
@@ -151,6 +163,25 @@
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
+  int rate;
+  // 1 if neither AC nor DC is coded. Only used during RDO.
+  int pvq_skip[MAX_MB_PLANE];
+  PVQ_QUEUE *pvq_q;
+  // Storage for PVQ tx block encodings in a superblock.
+  // There can be max 16x16 of 4x4 blocks (and YUV) encoded by PVQ
+  // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from:
+  // 1) PVQ is applied to each transformed block
+  // 2) 4x4 is the smallest tx size in AV1
+  // 3) AV1 allows using smaller tx size than block (i.e. partition) size
+  // TODO(yushin) : The memory usage could be improved a lot, since this has
+  // storage for 10 bands and 128 coefficients for every 4x4 block.
+  daala_enc_ctx daala_enc;
+  int pvq_speed;
+  int pvq_coded;  // Indicates whether pvq_info needs be stored to tokenize
 #ifdef __cplusplus
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 9b21a1d..b7b5cbe 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -30,6 +30,10 @@
                     aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
     CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
                     aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+    CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i],
+                    aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i])));
     CHECK_MEM_ERROR(cm, ctx->eobs[i],
                     aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
@@ -54,6 +58,10 @@
     ctx->qcoeff[i] = 0;
     ctx->dqcoeff[i] = 0;
+    aom_free(ctx->pvq_ref_coeff[i]);
+    ctx->pvq_ref_coeff[i] = 0;
     ctx->eobs[i] = 0;
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index c482e13..4f1c647 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -33,6 +33,9 @@
   tran_low_t *coeff[MAX_MB_PLANE];
   tran_low_t *qcoeff[MAX_MB_PLANE];
   tran_low_t *dqcoeff[MAX_MB_PLANE];
+  tran_low_t *pvq_ref_coeff[MAX_MB_PLANE];
   uint16_t *eobs[MAX_MB_PLANE];
   int num_4x4_blk;
diff --git a/av1/encoder/daala_compat_enc.c b/av1/encoder/daala_compat_enc.c
new file mode 100644
index 0000000..c23b26d
--- /dev/null
+++ b/av1/encoder/daala_compat_enc.c
@@ -0,0 +1,22 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+#include "encint.h"
+void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) {
+  od_ec_enc_checkpoint(&rbuf->ec, &enc->ec);
+  OD_COPY(&rbuf->adapt, &enc->state.adapt, 1);
+void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) {
+  od_ec_enc_rollback(&enc->ec, &rbuf->ec);
+  OD_COPY(&enc->state.adapt, &rbuf->adapt, 1);
diff --git a/av1/encoder/encint.h b/av1/encoder/encint.h
new file mode 100644
index 0000000..1e3516c
--- /dev/null
+++ b/av1/encoder/encint.h
@@ -0,0 +1,51 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+#if !defined(_encint_H)
+# define _encint_H (1)
+typedef struct daala_enc_ctx od_enc_ctx;
+typedef struct od_params_ctx od_params_ctx;
+typedef struct od_rollback_buffer od_rollback_buffer;
+# include "aom_dsp/entenc.h"
+# include "av1/common/odintrin.h"
+# include "av1/common/pvq_state.h"
+struct daala_enc_ctx{
+  /* Stores context-adaptive CDFs for PVQ. */
+  od_state state;
+  /* Daala entropy encoder. */
+  od_ec_enc ec;
+  int use_activity_masking;
+  /* Mode of quantization matrix : FLAT (0) or HVS (1) */
+  int qm;
+  /*Normalized PVQ lambda for use where we've already performed
+     quantization.*/
+  double pvq_norm_lambda;
+  double pvq_norm_lambda_dc;
+// from daalaenc.h
+/**The encoder context.*/
+typedef struct daala_enc_ctx daala_enc_ctx;
+/** Holds important encoder information so we can roll back decisions */
+struct od_rollback_buffer {
+  od_ec_enc ec;
+  od_adapt_ctx adapt;
+void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf);
+void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf);
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 839f961..c385504 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -47,6 +47,10 @@
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/pvq_encoder.h"
 static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
                               TOKENEXTRA **t, int output_enabled, int mi_row,
                               int mi_col, BLOCK_SIZE bsize,
@@ -941,6 +945,9 @@
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
     pd[i].dqcoeff = ctx->dqcoeff[i];
+    pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
     p[i].eobs = ctx->eobs[i];
@@ -1015,6 +1022,11 @@
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
+  x->pvq_speed = 1;
+  x->pvq_coded = 0;
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
   mbmi->sb_type = bsize;
@@ -1023,6 +1035,9 @@
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
     pd[i].dqcoeff = ctx->dqcoeff[i];
+    pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
     p[i].eobs = ctx->eobs[i];
@@ -1431,6 +1446,9 @@
                             ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                             ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                             PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                            od_rollback_buffer *rdo_buf,
                             BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int p;
@@ -1453,12 +1471,18 @@
          sizeof(*xd->above_seg_context) * mi_width);
   memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), sl,
          sizeof(xd->left_seg_context[0]) * mi_height);
+  od_encode_rollback(&x->daala_enc, rdo_buf);
 static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                          PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                         od_rollback_buffer *rdo_buf,
                          BLOCK_SIZE bsize) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   int p;
@@ -1483,6 +1507,9 @@
          sizeof(*xd->above_seg_context) * mi_width);
   memcpy(sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
          sizeof(xd->left_seg_context[0]) * mi_height);
+  od_encode_checkpoint(&x->daala_enc, rdo_buf);
 static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
@@ -1661,6 +1688,9 @@
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
   int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+  od_rollback_buffer pre_rdo_buf;
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
@@ -1675,7 +1705,11 @@
   subsize = get_subsize(bsize, partition);
   pc_tree->partitioning = partition;
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  save_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -1715,7 +1749,11 @@
             RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist);
       restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
       mi_8x8[0]->mbmi.sb_type = bs_type;
       pc_tree->partitioning = partition;
@@ -1819,7 +1857,11 @@
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
     pc_tree->partitioning = PARTITION_SPLIT;
     // Split partition.
@@ -1829,18 +1871,27 @@
       RD_COST tmp_rdc;
       ENTROPY_CONTEXT l2[16 * MAX_MB_PLANE], a2[16 * MAX_MB_PLANE];
       PARTITION_CONTEXT sl2[8], sa2[8];
+      od_rollback_buffer buf;
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
       save_context(x, mi_row, mi_col, a2, l2, sa2, sl2, bsize);
+      save_context(x, mi_row, mi_col, a2, l2, sa2, sl2, &buf, bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
                        &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
       restore_context(x, mi_row, mi_col, a2, l2, sa2, sl2, bsize);
+      restore_context(x, mi_row, mi_col, a2, l2, sa2, sl2, &buf, bsize);
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1877,7 +1928,11 @@
     chosen_rdc = none_rdc;
   restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
@@ -2169,6 +2224,11 @@
       !force_vert_split && yss <= xss && bsize_at_least_8x8;
   int partition_vert_allowed =
       !force_horz_split && xss <= yss && bsize_at_least_8x8;
+  od_rollback_buffer pre_rdo_buf;
   assert(num_8x8_blocks_wide_lookup[bsize] ==
@@ -2209,8 +2269,11 @@
     partition_horz_allowed &= force_horz_split;
     partition_vert_allowed &= force_vert_split;
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  save_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
   if (cpi->use_fp_mb_stats) {
@@ -2355,7 +2418,11 @@
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
   // store estimated motion vector
@@ -2418,7 +2485,11 @@
       // gives better rd cost
       do_rectangular_split &= !partition_none_allowed;
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
@@ -2466,8 +2537,13 @@
         pc_tree->partitioning = PARTITION_HORZ;
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
   if (partition_vert_allowed &&
       (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) {
@@ -2513,7 +2589,11 @@
         pc_tree->partitioning = PARTITION_VERT;
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
   // TODO(jbb): This code added so that we avoid static analysis
@@ -2531,7 +2611,9 @@
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
   } else {
@@ -2745,6 +2827,13 @@
             tile_data->mode_map[i][j] = j;
+        // This will be dynamically increased as more PVQ blocks are encoded.
+        tile_data->pvq_q.buf_len = 1000;
+        CHECK_MEM_ERROR(cm, tile_data->pvq_q.buf,
+                        aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO)));
+        tile_data->pvq_q.curr_pos = 0;
@@ -2757,6 +2846,9 @@
       cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
       pre_tok = cpi->tile_tok[tile_row][tile_col];
       tile_tok = allocated_tokens(*tile_info);
+      cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0;
@@ -2769,11 +2861,43 @@
   const TileInfo *const tile_info = &this_tile->tile_info;
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
+  od_adapt_ctx *adapt;
   // Set up pointers to per thread motion search counters.
   td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
   td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+  td->mb.pvq_q = &this_tile->pvq_q;
+  // TODO(yushin)
+  // If activity masking is enabled, change below to OD_HVS_QM
+  td->mb.daala_enc.qm = OD_FLAT_QM;  // Hard coded. Enc/dec required to sync.
+  {
+    // FIXME: Multiple segments support
+    int segment_id = 0;
+    int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id);
+    int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+    int64_t q_ac = av1_ac_quant(qindex, 0, cpi->common.bit_depth);
+    int64_t q_dc = av1_dc_quant(qindex, 0, cpi->common.bit_depth);
+    /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */
+    td->mb.daala_enc.pvq_norm_lambda =
+        (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS));
+    td->mb.daala_enc.pvq_norm_lambda_dc =
+        (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS));
+    // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda);
+  }
+  od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv,
+             td->mb.daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+  od_ec_enc_init(&td->mb.daala_enc.ec, 65025);
+  adapt = &td->mb.daala_enc.state.adapt;
+  od_ec_enc_reset(&td->mb.daala_enc.ec);
+  od_adapt_ctx_reset(adapt, 0);
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += MAX_MIB_SIZE) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
@@ -2782,6 +2906,16 @@
       (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
   assert(tok - cpi->tile_tok[tile_row][tile_col] <=
+  od_ec_enc_clear(&td->mb.daala_enc.ec);
+  td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos;
+  // rewind current position so that bitstream can be written
+  // from the 1st pvq block
+  td->mb.pvq_q->curr_pos = 0;
+  td->mb.pvq_q = NULL;
 static void encode_tiles(AV1_COMP *cpi) {
@@ -3065,6 +3199,11 @@
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+  x->pvq_speed = 0;
+  x->pvq_coded = output_enabled ? 1 : 0;
   if (!is_inter_block(mbmi)) {
     int plane;
     mbmi->skip = 1;
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 4c938c0..1b87589 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -27,6 +27,12 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/encint.h"
+#include "av1/common/partition.h"
+#include "av1/encoder/pvq_encoder.h"
 struct optimize_ctx {
@@ -63,6 +69,7 @@
   short qc;
 } av1_token_state;
 // TODO(jimbankoski): experiment to find optimal RD numbers.
 static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
@@ -328,6 +335,7 @@
   mb->plane[plane].eobs[block] = final_eob;
   return final_eob;
 // TODO(sarahparker) refactor fwd quant functions to use fwd_txfm fns in
 // hybrid_fwd_txfm.c
@@ -335,8 +343,13 @@
                         int block, int blk_row, int blk_col,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
@@ -345,12 +358,13 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int seg_id = xd->mi[0]->mbmi.segment_id;
   int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
   const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
   const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
   const int16_t *src_diff;
@@ -364,6 +378,40 @@
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+  uint8_t *src, *dst;
+  int16_t *src_int16, *pred;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int tx_blk_size;
+  int i, j;
+  int skip = 1;
+  PVQ_INFO *pvq_info = NULL;
+  (void)scan_order;
+  (void)qcoeff;
+  if (x->pvq_coded) {
+    assert(block < MAX_PVQ_BLOCKS_IN_SB);
+    pvq_info = &x->pvq[block][plane];
+  }
+  dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+  src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+  src_int16 = &p->src_int16[4 * (blk_row * diff_stride + blk_col)];
+  pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
+  // transform block size in pixels
+  tx_blk_size = tx_size_1d[tx_size];
+  // copy uint8 orig and predicted block to int16 buffer
+  // in order to use existing VP10 transform functions
+  for (j = 0; j < tx_blk_size; j++)
+    for (i = 0; i < tx_blk_size; i++) {
+      src_int16[diff_stride * j + i] = src[src_stride * j + i];
+      pred[diff_stride * j + i] = dst[dst_stride * j + i];
+    }
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -402,7 +450,7 @@
       case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+        if (xd->lossless[seg_id]) {
           av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
         } else {
           aom_highbd_fdct4x4(src_diff, coeff, diff_stride);
@@ -422,6 +470,7 @@
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
@@ -456,7 +505,7 @@
     case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+      if (xd->lossless[seg_id]) {
         av1_fwht4x4(src_diff, coeff, diff_stride);
       } else {
         aom_fdct4x4(src_diff, coeff, diff_stride);
@@ -472,14 +521,68 @@
     default: assert(0); break;
+#else   // #if !CONFIG_PVQ
+  switch (tx_size) {
+    case TX_32X32:
+      // NOTE: Using x->use_lp32x32fdct == 1 would make the encoder and decoder
+      // mismatch, because the decoder always uses x->use_lp32x32fdct == 0.
+      // forward transform of predicted image.
+      fdct32x32(0, pred, ref_coeff, diff_stride);
+      // forward transform of original image.
+      fdct32x32(0, src_int16, coeff, diff_stride);
+      break;
+    case TX_16X16:
+      aom_fdct16x16(pred, ref_coeff, diff_stride);
+      aom_fdct16x16(src_int16, coeff, diff_stride);
+      break;
+    case TX_8X8:
+      aom_fdct8x8(pred, ref_coeff, diff_stride);
+      aom_fdct8x8(src_int16, coeff, diff_stride);
+      break;
+    case TX_4X4:
+      if (xd->lossless[seg_id]) {
+        av1_fwht4x4(pred, ref_coeff, diff_stride);
+        av1_fwht4x4(src_int16, coeff, diff_stride);
+      } else {
+        aom_fdct4x4(pred, ref_coeff, diff_stride);
+        aom_fdct4x4(src_int16, coeff, diff_stride);
+      }
+      break;
+    default: assert(0); break;
+  }
+  // PVQ for inter mode block
+  if (!x->skip_block)
+    skip = av1_pvq_encode_helper(&x->daala_enc,
+                                 coeff,        // target original vector
+                                 ref_coeff,    // reference vector
+                                 dqcoeff,      // de-quantized vector
+                                 eob,          // End of Block marker
+                                 pd->dequant,  // aom's quantizers
+                                 plane,        // image plane
+                                 tx_size,      // block size in log_2 - 2
+                                 tx_type,
+                                 &x->rate,  // rate measured
+                                 x->pvq_speed,
+                                 pvq_info);  // PVQ info for a block
+  x->pvq_skip[plane] = skip;
+  if (!skip) mbmi->skip = 0;
+#endif  // #if !CONFIG_PVQ
 void av1_xform_quant(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
                      int block, int blk_row, int blk_col,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
@@ -489,22 +592,60 @@
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int seg_id = xd->mi[0]->mbmi.segment_id;
+  FWD_TXFM_PARAM fwd_txfm_param;
   int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
   const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
   const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
   const int16_t *src_diff;
-  FWD_TXFM_PARAM fwd_txfm_param;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+  uint8_t *src, *dst;
+  int16_t *src_int16, *pred;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int tx_blk_size;
+  int i, j;
+  int skip = 1;
+  PVQ_INFO *pvq_info = NULL;
+  (void)scan_order;
+  (void)qcoeff;
+  if (x->pvq_coded) {
+    assert(block < MAX_PVQ_BLOCKS_IN_SB);
+    pvq_info = &x->pvq[block][plane];
+  }
+  dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+  src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+  src_int16 = &p->src_int16[4 * (blk_row * diff_stride + blk_col)];
+  pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
+  // transform block size in pixels
+  tx_blk_size = tx_size_1d[tx_size];
+  // copy uint8 orig and predicted block to int16 buffer
+  // in order to use existing VP10 transform functions
+  for (j = 0; j < tx_blk_size; j++)
+    for (i = 0; i < tx_blk_size; i++) {
+      src_int16[diff_stride * j + i] = src[src_stride * j + i];
+      pred[diff_stride * j + i] = dst[dst_stride * j + i];
+    }
   fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
   fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
   fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
   fwd_txfm_param.lossless = xd->lossless[seg_id];
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
@@ -555,6 +696,7 @@
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
   switch (tx_size) {
     case TX_32X32:
@@ -599,6 +741,31 @@
     default: assert(0); break;
+#else   // #if !CONFIG_PVQ
+  fwd_txfm_param.rd_transform = 0;
+  fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+  fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+  // PVQ for inter mode block
+  if (!x->skip_block)
+    skip = av1_pvq_encode_helper(&x->daala_enc,
+                                 coeff,        // target original vector
+                                 ref_coeff,    // reference vector
+                                 dqcoeff,      // de-quantized vector
+                                 eob,          // End of Block marker
+                                 pd->dequant,  // aom's quantizers
+                                 plane,        // image plane
+                                 tx_size,      // block size in log_2 - 2
+                                 tx_type,
+                                 &x->rate,  // rate measured
+                                 x->pvq_speed,
+                                 pvq_info);  // PVQ info for a block
+  x->pvq_skip[plane] = skip;
+  if (!skip) mbmi->skip = 0;
+#endif  // #if !CONFIG_PVQ
 static void encode_block(int plane, int block, int blk_row, int blk_col,
@@ -614,6 +781,10 @@
   uint8_t *dst;
   TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+  int tx_blk_size;
+  int i, j;
   dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
   a = &ctx->ta[plane][blk_col];
   l = &ctx->tl[plane][blk_row];
@@ -626,6 +797,7 @@
   if (x->optimize) {
     const int combined_ctx = combine_entropy_contexts(*a, *l);
     *a = *l = optimize_b(cm, x, plane, block, tx_size, combined_ctx) > 0;
@@ -636,6 +808,24 @@
   if (p->eobs[block]) *(args->skip) = 0;
   if (p->eobs[block] == 0) return;
+  *a = *l = !x->pvq_skip[plane];
+  if (!x->pvq_skip[plane]) *(args->skip) = 0;
+  if (x->pvq_skip[plane]) return;
+  // transform block size in pixels
+  tx_blk_size = tx_size_1d[tx_size];
+  // av1 has no standalone inverse transform: av1_inv_txfm_add_*x*() also
+  // adds the predicted image to the inverse-transformed image.
+  // So pass a blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst to
+  // zeros.
+  for (j = 0; j < tx_blk_size; j++)
+    for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     switch (tx_size) {
@@ -665,7 +855,6 @@
   switch (tx_size) {
     case TX_32X32:
       av1_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride, p->eobs[block],
@@ -710,7 +899,28 @@
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
   if (p->eobs[block] > 0) {
+  if (!x->pvq_skip[plane]) {
+    {
+      int tx_blk_size;
+      int i, j;
+      // transform block size in pixels
+      tx_blk_size = tx_size_1d[tx_size];
+      // av1 has no standalone inverse transform: av1_inv_txfm_add_*x*() also
+      // adds the predicted image to the inverse-transformed image.
+      // So pass a blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst
+      // to zeros.
+      for (j = 0; j < tx_blk_size; j++)
+        for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
+    }
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       if (xd->lossless[0]) {
@@ -750,8 +960,9 @@
   if (x->skip) return;
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     av1_subtract_plane(x, bsize, plane);
     if (x->optimize) {
       const struct macroblockd_plane *const pd = &xd->plane[plane];
       const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
@@ -785,7 +996,6 @@
   const int bhl = b_height_log2_lookup[plane_bsize];
   const int diff_stride = 4 * (1 << bwl);
   uint8_t *src, *dst;
-  int16_t *src_diff;
   uint16_t *eob = &p->eobs[block];
   int seg_id = xd->mi[0]->mbmi.segment_id;
@@ -795,10 +1005,31 @@
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
+  FWD_TXFM_PARAM fwd_txfm_param;
+  int16_t *src_diff;
   int tx1d_size = tx_size_1d[tx_size];
-  FWD_TXFM_PARAM fwd_txfm_param;
+  tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+  int16_t *src_int16;
+  int tx_blk_size;
+  int i, j;
+  int16_t *pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
+  int skip = 1;
+  PVQ_INFO *pvq_info = NULL;
+  (void)scan_order;
+  (void)qcoeff;
+  if (x->pvq_coded) {
+    assert(block < MAX_PVQ_BLOCKS_IN_SB);
+    pvq_info = &x->pvq[block][plane];
+  }
+  src_int16 = &p->src_int16[4 * (blk_row * diff_stride + blk_col)];
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
   fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
   fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
@@ -807,8 +1038,6 @@
   dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
   src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
   mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
   av1_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride, dst,
                           dst_stride, blk_col, blk_row, plane);
@@ -884,6 +1113,8 @@
   aom_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
                      src_stride, dst, dst_stride);
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
   switch (tx_size) {
     case TX_32X32:
@@ -939,7 +1170,78 @@
     default: assert(0); break;
+#else   // #if !CONFIG_PVQ
+  // transform block size in pixels
+  tx_blk_size = tx_size_1d[tx_size];
+  // copy uint8 orig and predicted block to int16 buffer
+  // in order to use existing VP10 transform functions
+  for (j = 0; j < tx_blk_size; j++)
+    for (i = 0; i < tx_blk_size; i++) {
+      src_int16[diff_stride * j + i] = src[src_stride * j + i];
+      pred[diff_stride * j + i] = dst[dst_stride * j + i];
+    }
+  fwd_txfm_param.rd_transform = 0;
+  fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+  fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+  // PVQ for intra mode block
+  if (!x->skip_block)
+    skip = av1_pvq_encode_helper(&x->daala_enc,
+                                 coeff,        // target original vector
+                                 ref_coeff,    // reference vector
+                                 dqcoeff,      // de-quantized vector
+                                 eob,          // End of Block marker
+                                 pd->dequant,  // aom's quantizers
+                                 plane,        // image plane
+                                 tx_size,      // block size in log_2 - 2
+                                 tx_type,
+                                 &x->rate,  // rate measured
+                                 x->pvq_speed,
+                                 pvq_info);  // PVQ info for a block
+  x->pvq_skip[plane] = skip;
+  if (!skip) mbmi->skip = 0;
+  // av1 has no standalone inverse transform: av1_inv_txfm_add_*x*() also
+  // adds the predicted image to the inverse-transformed image.
+  // So pass a blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst to
+  // zeros.
+  if (!skip) {
+    for (j = 0; j < tx_blk_size; j++)
+      for (i = 0; i < tx_blk_size; i++) dst[j * dst_stride + i] = 0;
+    switch (tx_size) {
+      case TX_32X32:
+        av1_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
+        break;
+      case TX_16X16:
+        av1_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
+        break;
+      case TX_8X8:
+        av1_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
+        break;
+      case TX_4X4:
+        // this is like av1_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        av1_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, tx_type,
+                             xd->lossless[seg_id]);
+        break;
+      default: assert(0); break;
+    }
+  }
+#endif  // #if !CONFIG_PVQ
   if (*eob) *(args->skip) = 0;
+// Note : *(args->skip) == mbmi->skip
 void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
@@ -950,3 +1252,141 @@
   av1_foreach_transformed_block_in_plane(xd, bsize, plane,
                                          av1_encode_block_intra, &arg);
+int av1_pvq_encode_helper(daala_enc_ctx *daala_enc, tran_low_t *const coeff,
+                          tran_low_t *ref_coeff, tran_low_t *const dqcoeff,
+                          uint16_t *eob, const int16_t *quant, int plane,
+                          int tx_size, TX_TYPE tx_type, int *rate, int speed,
+                          PVQ_INFO *pvq_info) {  // pvq_info may be NULL
+  const int tx_blk_size = tx_size_1d[tx_size];  // block width in pixels
+  int skip;  // returned to the caller; cleared below if a DC residue is coded
+  // Purpose: PVQ-code `coeff` against `ref_coeff`, reconstruct into `dqcoeff`.
+  // TODO(yushin): Enable this later, when pvq_qm_q4 is available in AOM:
+  //  int pvq_dc_quant = OD_MAXI(1, quant *
+  //  daala_enc->state.pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >> 4);
+  int quant_shift = tx_size == TX_32X32 ? 1 : 0;  // halve quantizers at 32x32
+  // DC quantizer for PVQ, clamped to be at least 1
+  int pvq_dc_quant = OD_MAXI(1, quant[0] >> quant_shift);
+  int tell;  // entropy coder position before coding; used to compute *rate
+  int has_dc_skip = 1;  // constant: DC residue is coded only when nonzero
+  int i;
+  int off = od_qm_offset(tx_size, plane ? 1 : 0);  // offset into qm / qm_inv
+  double save_pvq_lambda;  // only written and read when plane != 0
+  DECLARE_ALIGNED(16, int16_t, coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, ref_coeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int16_t, dqcoeff_pvq[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int32_t, in_int32[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int32_t, ref_int32[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  DECLARE_ALIGNED(16, int32_t, out_int32[OD_BSIZE_MAX * OD_BSIZE_MAX]);
+  *eob = 0;  // overwritten with the full block size before returning
+  tell = od_ec_enc_tell_frac(&daala_enc->ec);
+  // Change coefficient ordering from raster to coding order for PVQ encoding.
+  od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff,
+                            tx_blk_size);
+  od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff,
+                            tx_blk_size);
+  // copy int16 inputs to the int32 buffers passed to od_pvq_encode()
+  for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
+    ref_int32[i] = ref_coeff_pvq[i];
+    in_int32[i] = coeff_pvq[i];
+  }
+  if (plane != 0) {  // planes other than 0: scale lambda by 0.8 for this call
+    save_pvq_lambda = daala_enc->pvq_norm_lambda;
+    daala_enc->pvq_norm_lambda *= 0.8;
+  }
+  if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */
+    out_int32[0] = 0;  // DC difference below 141/256 of the quantizer: drop it
+  } else {
+    out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant);
+  }
+  skip = od_pvq_encode(
+      daala_enc, ref_int32, in_int32, out_int32,
+      (int)quant[0] >> quant_shift,  // scale/quantizer
+      (int)quant[1] >> quant_shift,  // scale/quantizer
+      // TODO(yushin): Instead of 0,
+      //   use daala_enc->use_activity_masking for activity masking.
+      plane, tx_size, OD_PVQ_BETA[0][plane][tx_size],
+      0,        // is_keyframe,
+      0, 0, 0,  // q_scaling, bx, by,
+      daala_enc->state.qm + off, daala_enc->state.qm_inv + off,
+      speed,  // speed
+      pvq_info);
+  if (skip && pvq_info) assert(pvq_info->ac_dc_coded == 0);
+  if (!skip && pvq_info) assert(pvq_info->ac_dc_coded > 0);
+  // Encode residue of DC coeff, if required (magnitude minus has_dc_skip).
+  if (!has_dc_skip || out_int32[0]) {
+    generic_encode(&daala_enc->ec, &daala_enc->state.adapt.model_dc[plane],
+                   abs(out_int32[0]) - has_dc_skip, -1,
+                   &daala_enc->state.adapt.ex_dc[plane][tx_size][0], 2);
+  }
+  if (out_int32[0]) {
+    od_ec_enc_bits(&daala_enc->ec, out_int32[0] < 0, 1);  // one bit: negative?
+    skip = 0;  // a coded DC residue means the block is not skipped
+  }
+  // need to save quantized residue of DC coeff
+  // so that final pvq bitstream writing can know whether DC is coded.
+  if (pvq_info) pvq_info->dq_dc_residue = out_int32[0];
+  out_int32[0] = out_int32[0] * pvq_dc_quant;  // scale DC residue back up
+  out_int32[0] += ref_int32[0];  // and add the reference DC
+  // copy int32 result back to int16
+  for (i = 0; i < tx_blk_size * tx_blk_size; i++) dqcoeff_pvq[i] = out_int32[i];
+  // Back to original (raster) coefficient order
+  od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq,
+                            tx_blk_size);
+  *eob = tx_blk_size * tx_blk_size;  // always the full block, even if skip
+  *rate = (od_ec_enc_tell_frac(&daala_enc->ec) - tell)
+          << (AV1_PROB_COST_SHIFT - OD_BITRES);  // coder position delta
+  assert(*rate >= 0);
+  if (plane != 0) daala_enc->pvq_norm_lambda = save_pvq_lambda;  // undo 0.8x
+  return skip;
+void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta,
+                            int *max_theta, int *k, od_coeff *y, int nb_bands,
+                            const int *off, int *size, int skip_rest,
+                            int skip_dir,
+                            int bs) {  // block size in log_2 - 2
+  int i;  // Function purpose: copy every argument into *pvq_info.
+  const int tx_blk_size = tx_size_1d[bs];  // bs indexes tx_size_1d
+  for (i = 0; i < nb_bands; i++) {  // per-band parameters, nb_bands entries
+    pvq_info->qg[i] = qg[i];
+    pvq_info->theta[i] = theta[i];
+    pvq_info->max_theta[i] = max_theta[i];
+    pvq_info->k[i] = k[i];
+    pvq_info->off[i] = off[i];
+    pvq_info->size[i] = size[i];
+  }
+  memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff));  // all of y
+  pvq_info->nb_bands = nb_bands;
+  pvq_info->skip_rest = skip_rest;
+  pvq_info->skip_dir = skip_dir;
+  pvq_info->bs = bs;
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 2576b1a..c5f2ac1 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -43,6 +43,19 @@
 void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int plane);
+int av1_pvq_encode_helper(daala_enc_ctx *daala_enc, tran_low_t *const coeff,
+                          tran_low_t *ref_coeff, tran_low_t *const dqcoeff,
+                          uint16_t *eob, const int16_t *quant, int plane,
+                          int tx_size, TX_TYPE tx_type, int *rate, int speed,
+                          PVQ_INFO *pvq_info);
+void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta,
+                            int *max_theta, int *k, od_coeff *y, int nb_bands,
+                            const int *off, int *size, int skip_rest,
+                            int skip_dir, int bs);
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 0b6410a..13f54c9 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -373,6 +373,20 @@
   cpi->mbmi_ext_base = NULL;
+  if (cpi->oxcf.pass != 1) {
+    const int tile_cols = 1 << cm->log2_tile_cols;
+    const int tile_rows = 1 << cm->log2_tile_rows;
+    int tile_col, tile_row;
+    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+        TileDataEnc *tile_data =
+            &cpi->tile_data[tile_row * tile_cols + tile_col];
+        aom_free(tile_data->pvq_q.buf);
+      }
+  }
   cpi->tile_data = NULL;
@@ -727,7 +741,11 @@
   av1_set_mb_mi(cm, cm->width, cm->height);
-  av1_init_macroblockd(cm, xd, NULL);
+  av1_init_macroblockd(cm, xd,
+                       NULL,
+                       NULL);
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index d8a4b5f..daa90b3 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -265,6 +265,9 @@
   TileInfo tile_info;
   int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
   int mode_map[BLOCK_SIZES][MAX_MODES];
+  PVQ_QUEUE pvq_q;
 } TileDataEnc;
 typedef struct RD_COUNTS {
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 4d0acee..1dfd0a6 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -486,6 +486,9 @@
   double intra_factor;
   double brightness_factor;
   BufferPool *const pool = cm->buffer_pool;
+  PVQ_QUEUE pvq_q;
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
@@ -520,10 +523,43 @@
+  // For pass 1 of 2-pass encoding, init here for PVQ for now.
+  {
+    od_adapt_ctx *adapt;
+    pvq_q.buf_len = 5000;
+    CHECK_MEM_ERROR(cm, pvq_q.buf, aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO)));
+    pvq_q.curr_pos = 0;
+    x->pvq_coded = 0;
+    x->pvq_q = &pvq_q;
+    // TODO(yushin): Since this init step is also performed in the 2nd pass
+    // and in 1-pass encoding, consider factoring it out into a function.
+    // TODO(yushin): If activity masking is enabled, change the assignment
+    // below to OD_HVS_QM.
+    x->daala_enc.qm = OD_FLAT_QM;  // Hard coded. Enc/dec required to sync.
+    x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA;
+    x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA;
+    od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv,
+               x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT);
+    od_ec_enc_init(&x->, 65025);
+    adapt = &x->daala_enc.state.adapt;
+    od_ec_enc_reset(&x->;
+    od_adapt_ctx_reset(adapt, 0);
+  }
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff[i];
     p[i].qcoeff = ctx->qcoeff[i];
     pd[i].dqcoeff = ctx->dqcoeff[i];
+    pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i];
     p[i].eobs = ctx->eobs[i];
@@ -912,6 +948,16 @@
+  od_ec_enc_clear(&x->;
+  x->pvq_q->last_pos = x->pvq_q->curr_pos;
+  x->pvq_q->curr_pos = 0;
+  x->pvq_q = NULL;
+  aom_free(pvq_q.buf);
   // Clamp the image start to rows/2. This number of rows is discarded top
   // and bottom as dead data so rows / 2 means the frame is blank.
   if ((image_data_start_row > cm->mb_rows / 2) ||
diff --git a/av1/encoder/generic_encoder.c b/av1/encoder/generic_encoder.c
new file mode 100644
index 0000000..466ede3
--- /dev/null
+++ b/av1/encoder/generic_encoder.c
@@ -0,0 +1,200 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include <stdio.h>
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/entenc.h"
+#include "av1/common/generic_code.h"
+#include "av1/common/odintrin.h"
+#include "pvq_encoder.h"
+/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * The symbol is written with od_ec_encode_cdf_q15() and the cdf is then
+ * handed to od_cdf_adapt_q15(). When *count is 0 the cdf is first rescaled
+ * in place (see the comment in the body).
+ *
+ * @param [in,out] ec    range encoder
+ * @param [in]     val   variable being encoded
+ * @param [in,out] cdf   CDF of the variable (Q15)
+ * @param [in]     n     number of values possible
+ * @param [in,out] count number of symbols encoded with that cdf so far
+ * @param [in]     rate  adaptation rate shift (smaller is faster)
+ */
+void od_encode_cdf_adapt_q15(od_ec_enc *ec, int val, uint16_t *cdf, int n,
+ int *count, int rate) {
+  int i;
+  if (*count == 0) {
+    /* On the first call, we normalize the cdf to (32768 - n). This should
+       eventually be moved to the state init, but for now it makes it much
+       easier to experiment and convert symbols to the Q15 adaptation.
+       NOTE(review): the scaling below maps cdf[n - 1] to exactly 32768,
+       not 32768 - n; confirm which total is intended.*/
+    int ft;
+    ft = cdf[n - 1];  /* Current total of the cdf before rescaling. */
+    for (i = 0; i < n; i++) {
+      cdf[i] = cdf[i]*32768/ft;
+    }
+  }
+  od_ec_encode_cdf_q15(ec, val, cdf, n);
+  od_cdf_adapt_q15(val, cdf, n, count, rate);
+/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts
+ * the cdf accordingly.
+ *
+ * After the symbol is coded, `increment` is added to every cdf entry from
+ * index val upwards. If cdf[n - 1] + increment would exceed 32767, every
+ * entry is first halved (plus a small per-index offset) to make room.
+ *
+ * @param [in,out] ec    range encoder
+ * @param [in]     val   variable being encoded
+ * @param [in,out] cdf   CDF of the variable (Q15), updated in place
+ * @param [in]     n     number of values possible
+ * @param [in]     increment adaptation speed (Q15)
+ */
+void od_encode_cdf_adapt(od_ec_enc *ec, int val, uint16_t *cdf, int n,
+ int increment) {
+  int i;
+  od_ec_encode_cdf_unscaled(ec, val, cdf, n);
+  if (cdf[n-1] + increment > 32767) {
+    for (i = 0; i < n; i++) {
+      /* Second term (i + 1) ensures that the pdf is non-null */
+      cdf[i] = (cdf[i] >> 1) + i + 1;
+    }
+  }
+  for (i = val; i < n; i++) cdf[i] += increment;
+/** Encodes a random variable using a "generic" model, assuming that the
+ * distribution is one-sided (zero and up), has a single mode, and decays
+ * exponentially past the mode.
+ *
+ * The value is coded in up to three parts: a symbol in 0..15 from the cdf
+ * selected by the current expectation, an exponential tail (only when the
+ * scaled value reaches 15), and raw low-order bits (only when the value was
+ * scaled down). The model and expectation are updated afterwards through
+ * generic_model_update().
+ *
+ * @param [in,out] enc   range encoder
+ * @param [in,out] model generic probability model
+ * @param [in]     x     variable being encoded
+ * @param [in]     max   largest value possible; 0 means nothing is coded,
+ * -1 means there is no bound (all 16 symbols are allowed)
+ * @param [in,out] ex_q16 expectation of x (adapted)
+ * @param [in]     integration integration period of ex_q16 (leaky average
+ * over 1<<integration samples)
+ */
+void generic_encode(od_ec_enc *enc, generic_encoder *model, int x, int max,
+ int *ex_q16, int integration) {
+  int lg_q1;
+  int shift;
+  int id;
+  uint16_t *cdf;
+  int xs;
+  int ms;
+  if (max == 0) return;  /* x can only be 0: no bits are needed. */
+  lg_q1 = log_ex(*ex_q16);
+   "%d %d", *ex_q16, lg_q1));
+  /* If expectation is too large, shift x to ensure that
+     all we have past xs=15 is the exponentially decaying tail
+     of the distribution */
+  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+  /* Choose the cdf to use: we have two per "octave" of ex_q16 */
+  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+  cdf = model->cdf[id];
+  xs = (x + (1 << shift >> 1)) >> shift;  /* x scaled down, round-to-nearest */
+  ms = (max + (1 << shift >> 1)) >> shift;  /* max scaled the same way */
+  OD_ASSERT(max == -1 || xs <= ms);
+  if (max == -1) od_ec_encode_cdf_unscaled(enc, OD_MINI(15, xs), cdf, 16);
+  else {
+    od_ec_encode_cdf_unscaled(enc, OD_MINI(15, xs), cdf, OD_MINI(ms + 1, 16));
+  }
+  if (xs >= 15) {
+    int e;
+    unsigned decay;
+    /* Estimate decay based on the assumption that the distribution is close
+       to Laplacian for large values. We should probably have an adaptive
+       estimate instead. Note: The 2* is a kludge that's not fully understood
+       yet. */
+    OD_ASSERT(*ex_q16 < INT_MAX >> 1);
+    e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift;
+    decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256)));
+    /* Encode the tail of the distribution assuming exponential decay. */
+    od_laplace_encode_special(enc, xs - 15, decay, (max == -1) ? -1 : ms - 15);
+  }
+  if (shift != 0) {
+    int special;
+    /* Because of the rounding, there's only half the number of possibilities
+       for xs=0. */
+    special = xs == 0;
+    if (shift - special > 0) {
+      /* Raw bits carry the remainder of x relative to its scaled value. */
+      od_ec_enc_bits(enc, x - (xs << shift) + (!special << (shift - 1)),
+       shift - special);
+    }
+  }
+  generic_model_update(model, ex_q16, x, xs, id, integration);
+   "enc: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, enc->rng));
+/** Estimates the cost of encoding a value with generic_encode().
+ *
+ * The estimate is -log2 of the probability of the 0..15 symbol under the
+ * selected cdf, plus the number of raw bits implied by the scaling shift,
+ * plus a flat 2 bits when the symbol is 15 (stand-in for the tail coder).
+ * Nothing is written and, in the code below, the model and *ex_q16 are only
+ * read.
+ *
+ * @param [in,out] model generic probability model
+ * @param [in]     x     variable being encoded
+ * @param [in]     max   largest value possible; 0 costs nothing, -1 means
+ * there is no bound
+ * @param [in,out] ex_q16 expectation of x (adapted)
+ * @return number of bits (approximation)
+ */
+double generic_encode_cost(generic_encoder *model, int x, int max,
+ int *ex_q16) {
+  int lg_q1;
+  int shift;
+  int id;
+  uint16_t *cdf;
+  int xs;
+  int ms;
+  int extra;
+  if (max == 0) return 0;
+  lg_q1 = log_ex(*ex_q16);
+  /* If expectation is too large, shift x to ensure that
+     all we have past xs=15 is the exponentially decaying tail
+     of the distribution */
+  shift = OD_MAXI(0, (lg_q1 - 5) >> 1);
+  /* Choose the cdf to use: we have two per "octave" of ex_q16 */
+  id = OD_MINI(GENERIC_TABLES - 1, lg_q1);
+  cdf = model->cdf[id];
+  xs = (x + (1 << shift >> 1)) >> shift;
+  ms = (max + (1 << shift >> 1)) >> shift;
+  OD_ASSERT(max == -1 || xs <= ms);
+  extra = 0;
+  if (shift) extra = shift - (xs == 0);  /* Raw bits; one fewer when xs is 0. */
+  xs = OD_MINI(15, xs);
+  /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */
+  if (xs == 15) extra += 2;
+  if (max == -1) {
+    /* Unbounded: the probability is taken relative to the full cdf total. */
+    return extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/
+     cdf[15]);
+  }
+  else {
+    /* Bounded: the total is the cdf value at the largest allowed symbol. */
+    return extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/
+     cdf[OD_MINI(ms, 15)]);
+  }
+/*Estimates the cost of encoding a value with a given CDF.
+  Returns -log2(p), where p is the probability of val under the cdf:
+   (cdf[val] - cdf[val - 1])/cdf[n - 1], with cdf[-1] taken as 0.
+  n must be positive (asserted); the cdf is only read.*/
+double od_encode_cdf_cost(int val, uint16_t *cdf, int n) {
+  int total_prob;
+  int prev_prob;
+  double val_prob;
+  OD_ASSERT(n > 0);
+  total_prob = cdf[n - 1];
+  if (val == 0) {
+    prev_prob = 0;
+  }
+  else {
+    prev_prob = cdf[val - 1];
+  }
+  val_prob = (cdf[val] - prev_prob) / (double)total_prob;
+  return -OD_LOG2(val_prob);
diff --git a/av1/encoder/laplace_encoder.c b/av1/encoder/laplace_encoder.c
new file mode 100644
index 0000000..07dcaca
--- /dev/null
+++ b/av1/encoder/laplace_encoder.c
@@ -0,0 +1,292 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include <stdio.h>
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/entenc.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/pvq.h"
+#include "pvq_encoder.h"
+static void od_encode_pvq_split(od_ec_enc *ec, od_pvq_codeword_ctx *adapt,
+ int count, int sum, int ctx) { /* Codes count with an adaptive cdf picked
+   from ctx and sum (sum + 1 symbols); for large sums the low bits of count
+   are written raw instead of being entropy coded. */
+  int shift;
+  int rest;  /* Only written and read when shift is non-zero. */
+  int fctx;
+  if (sum == 0) return;  /* Nothing is coded when sum is zero. */
+  shift = OD_MAXI(0, OD_ILOG(sum) - 3);
+  if (shift) {
+    rest = count & ((1 << shift) - 1);  /* Low bits, sent raw at the end. */
+    count >>= shift;
+    sum >>= shift;
+  }
+  fctx = 7*ctx + sum - 1;  /* cdf index: 7 slots per ctx, chosen by sum. */
+  od_encode_cdf_adapt(ec, count, adapt->pvq_split_cdf[fctx],
+   sum + 1, adapt->pvq_split_increment);
+  if (shift) od_ec_enc_bits(ec, rest, shift);
+void od_encode_band_pvq_splits(od_ec_enc *ec, od_pvq_codeword_ctx *adapt,
+ const int *y, int n, int k, int level) { /* Codes where the k units of
+   absolute value in y[0..n-1] are located by recursively halving the vector
+   and coding how many fall in the right half. Signs are not coded here.
+   level is the recursion depth (the recursive calls pass level + 1). */
+  int mid;
+  int i;
+  int count_right;
+  if (n <= 1 || k == 0) return;  /* Nothing left to locate. */
+  if (k == 1 && n <= 16) {
+    /* A single unit in a short vector: code its position directly. */
+    int cdf_id;
+    int pos;
+    cdf_id = od_pvq_k1_ctx(n, level == 0);
+    /* Scan for the first non-zero entry. The loop has no bound of its own;
+       it relies on y holding a non-zero value within the first n entries. */
+    for (pos = 0; !y[pos]; pos++);
+    OD_ASSERT(pos < n);
+    od_encode_cdf_adapt(ec, pos, adapt->pvq_k1_cdf[cdf_id], n,
+     adapt->pvq_k1_increment);
+  }
+  else {
+    mid = n >> 1;
+    count_right = k;
+    /* Whatever is not in the left half must be in the right half. */
+    for (i = 0; i < mid; i++) count_right -= abs(y[i]);
+    od_encode_pvq_split(ec, adapt, count_right, k, od_pvq_size_ctx(n));
+    od_encode_band_pvq_splits(ec, adapt, y, mid, k - count_right, level + 1);
+    od_encode_band_pvq_splits(ec, adapt, y + mid, n - mid, count_right,
+     level + 1);
+  }
+/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't
+ * do anything special for the zero case.
+ *
+ * The value is coded as a sequence of symbols of at most 15 each; a symbol
+ * of 15 means another symbol follows (unless the remaining bound reaches 0).
+ * Before that, while decay is above 235 and the bound is either absent or
+ * still at least 15 after shifting, decay is squared (in Q8) and one more
+ * low bit of x is set aside; those bits are written raw at the end.
+ *
+ * @param [in,out] enc     range encoder
+ * @param [in]     x       variable to encode (has to be non-negative;
+ * callers in this file pass 0 for the first tail value)
+ * @param [in]     decay   decay factor of the distribution in Q8 format,
+ * i.e. pdf ~= decay^x
+ * @param [in]     max     maximum possible value of x (used to truncate
+ * the pdf); -1 means no bound, 0 means nothing is coded
+ */
+void od_laplace_encode_special(od_ec_enc *enc, int x, unsigned decay, int max) {
+  int shift;
+  int xs;
+  int ms;
+  int sym;
+  const uint16_t *cdf;
+  shift = 0;
+  if (max == 0) return;
+  /* We don't want a large decay value because that would require too many
+     symbols. However, it's OK if the max is below 15. */
+  while (((max >> shift) >= 15 || max == -1) && decay > 235) {
+    decay = (decay*decay + 128) >> 8;
+    shift++;
+  }
+  OD_ASSERT(x <= max || max == -1);
+  decay = OD_MINI(decay, 254);
+  decay = OD_MAXI(decay, 2);
+  xs = x >> shift;
+  ms = max >> shift;
+  cdf = EXP_CDF_TABLE[(decay + 1) >> 1];  /* One table per two decay values. */
+  OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay));
+  do {
+    sym = OD_MINI(xs, 15);
+    {
+      int i;
+      OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d %d\n", x, xs, shift,
+       sym, max));
+      for (i = 0; i < 16; i++) {
+        OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i]));
+      }
+    }
+    if (ms > 0 && ms < 15) {
+      /* Simple way of truncating the pdf when we have a bound */
+      od_ec_encode_cdf_unscaled(enc, sym, cdf, ms + 1);
+    }
+    else {
+      od_ec_encode_cdf_q15(enc, sym, cdf, 16);
+    }
+    xs -= 15;
+    ms -= 15;
+  }
+  while (sym >= 15 && ms != 0);
+  if (shift) od_ec_enc_bits(enc, x & ((1 << shift) - 1), shift);
+/** Encodes a Laplace-distributed variable for use in PVQ
+ *
+ * The value is coded as a symbol in 0..15 from a cdf derived from the
+ * expectation, then raw low bits if the value had to be scaled down, then
+ * (when the scaled value is 15 or more) a tail coded by
+ * od_laplace_encode_special().
+ * NOTE(review): the callers in this file pass non-negative values and code
+ * the sign separately; confirm what "including sign" below refers to.
+ *
+ * @param [in,out] enc   range encoder
+ * @param [in]     x     variable to encode (including sign)
+ * @param [in]     ex_q8 expectation of the absolute value of x in Q8
+ * @param [in]     k     maximum value of |x|
+ */
+void od_laplace_encode(od_ec_enc *enc, int x, int ex_q8, int k) {
+  int j;
+  int shift;
+  int xs;
+  uint16_t cdf[16];
+  int sym;
+  int decay;
+  int offset;
+  /* shift down x if expectation is too high */
+  shift = OD_ILOG(ex_q8) - 11;
+  if (shift < 0) shift = 0;
+  /* Apply the shift with rounding to ex_q8, k and xs */
+  ex_q8 = (ex_q8 + (1 << shift >> 1)) >> shift;
+  k = (k + (1 << shift >> 1)) >> shift;
+  xs = (x + (1 << shift >> 1)) >> shift;
+  decay = OD_MINI(254, 256*ex_q8/(ex_q8 + 256));
+  offset = LAPLACE_OFFSET[(decay + 1) >> 1];
+  /* Local cdf: the table row for this decay with a per-row offset removed. */
+  for (j = 0; j < 16; j++) {
+    cdf[j] = EXP_CDF_TABLE[(decay + 1) >> 1][j] - offset;
+  }
+  sym = xs;
+  if (sym > 15) sym = 15;
+  /* Simple way of truncating the pdf when we have a bound.
+     No symbol is coded at all when the (scaled) bound k is 0. */
+  if (k != 0) od_ec_encode_cdf_unscaled(enc, sym, cdf, OD_MINI(k + 1, 16));
+  if (shift) {
+    int special;
+    /* Because of the rounding, there's only half the number of possibilities
+       for xs=0 */
+    special = xs == 0;
+    if (shift - special > 0) {
+      od_ec_enc_bits(enc, x - (xs << shift) + (!special << (shift - 1)),
+       shift - special);
+    }
+  }
+  /* Handle the exponentially-decaying tail of the distribution */
+  OD_ASSERT(xs - 15 <= k - 15);
+  if (xs >= 15) od_laplace_encode_special(enc, xs - 15, decay, k - 15);
+static void laplace_encode_vector_delta(od_ec_enc *enc, const od_coeff *y, int n, int k,
+                                        int32_t *curr, const int32_t *means) { /*
+   Codes y[0..n-1] by position: for every non-zero entry it codes the
+   distance from the previous non-zero entry, a sign bit, and one
+   zero-distance symbol per additional unit of magnitude. Stops once k units
+   have been coded. Adaptation statistics are written to curr. */
+  int i;
+  int prev;
+  int sum_ex;
+  int sum_c;
+  int first;
+  int k_left;
+  int coef;
+  prev = 0;
+  sum_ex = 0;
+  sum_c = 0;
+  first = 1;
+  k_left = k;
+  coef = 256*means[OD_ADAPT_COUNT_Q8]/
+   (1 + means[OD_ADAPT_COUNT_EX_Q8]);
+  coef = OD_MAXI(coef, 1);
+  for (i = 0; i < n; i++) {
+    if (y[i] != 0) {
+      int j;
+      int count;
+      int mag;
+      mag = abs(y[i]);
+      count = i - prev;  /* Distance from the previous non-zero position. */
+      if (first) {
+        /* The first distance uses the tail coder with its own decay. */
+        int decay;
+        int ex = coef*(n - prev)/k_left;
+        if (ex > 65280) decay = 255;
+        else {
+          decay = OD_MINI(255,
+           (int)((256*ex/(ex + 256) + (ex>>5)*ex/((n + 1)*(n - 1)*(n - 1)))));
+        }
+        /*Update mean position.*/
+        OD_ASSERT(count <= n - 1);
+        od_laplace_encode_special(enc, count, decay, n - 1);
+        first = 0;
+      }
+      else od_laplace_encode(enc, count, coef*(n - prev)/k_left, n - prev - 1);
+      sum_ex += 256*(n - prev);
+      sum_c += count*k_left;
+      od_ec_enc_bits(enc, y[i] < 0, 1);  /* Sign bit: 1 when negative. */
+      /* Each extra unit at the same position is coded as a distance of 0. */
+      for (j = 0; j < mag - 1; j++) {
+        od_laplace_encode(enc, 0, coef*(n - i)/(k_left - 1 - j), n - i - 1);
+        sum_ex += 256*(n - i);
+      }
+      k_left -= mag;
+      prev = i;
+      if (k_left == 0) break;
+    }
+  }
+  if (k > 0) {
+    curr[OD_ADAPT_COUNT_Q8] = 256*sum_c;
+    curr[OD_ADAPT_COUNT_EX_Q8] = sum_ex;
+  }
+  else {
+  }
+  curr[OD_ADAPT_K_Q8] = 0;
+  curr[OD_ADAPT_SUM_EX_Q8] = 0;
+/** Encodes a vector of integers assumed to come from rounding a sequence of
+ * Laplace-distributed real values in decreasing order of variance.
+ *
+ * Each magnitude is coded with od_laplace_encode() followed by a raw sign
+ * bit for non-zero values; the magnitude of the last entry is not coded.
+ * When k <= 1, or once at most one unit remains before the last entry, the
+ * rest of the vector is handed to laplace_encode_vector_delta() instead.
+ *
+ * @param [in,out] enc range encoder
+ * @param [in]     y     vector to encode
+ * @param [in]     n     dimension of the vector
+ * @param [in]     k     sum of the absolute value of components of y
+ * @param [out]    curr  Adaptation context output, may alias means.
+ * @param [in]     means Adaptation context input.
+ */
+void od_laplace_encode_vector(od_ec_enc *enc, const od_coeff *y, int n, int k,
+                           int32_t *curr, const int32_t *means) {
+  int i;
+  int sum_ex;
+  int kn;
+  int exp_q8;
+  int mean_k_q8;
+  int mean_sum_ex_q8;
+  int ran_delta;
+  ran_delta = 0;
+  if (k <= 1) {
+    laplace_encode_vector_delta(enc, y, n, k, curr, means);
+    return;
+  }
+  sum_ex = 0;
+  kn = k;  /* Units of magnitude still to be coded. */
+  /* Estimates the factor relating pulses_left and positions_left to E(|x|) */
+  mean_k_q8 = means[OD_ADAPT_K_Q8];
+  mean_sum_ex_q8 = means[OD_ADAPT_SUM_EX_Q8];
+  /* Two forms of the same ratio; the second avoids the 256* multiply when
+     mean_k_q8 is at least 1 << 23. */
+  if (mean_k_q8 < 1 << 23) exp_q8 = 256*mean_k_q8/(1 + mean_sum_ex_q8);
+  else exp_q8 = mean_k_q8/(1 + (mean_sum_ex_q8 >> 8));
+  for (i = 0; i < n; i++) {
+    int ex;
+    int x;
+    if (kn == 0) break;
+    if (kn <= 1 && i != n - 1) {
+      laplace_encode_vector_delta(enc, y + i, n - i, kn, curr, means);
+      ran_delta = 1;
+      break;
+    }
+    x = abs(y[i]);
+    /* Expected value of x (round-to-nearest) is
+       expQ8*pulses_left/positions_left */
+    ex = (2*exp_q8*kn + (n - i))/(2*(n - i));
+    if (ex > kn*256) ex = kn*256;
+    sum_ex += (2*256*kn + (n - i))/(2*(n - i));
+    /* No need to encode the magnitude for the last bin. */
+    if (i != n - 1) od_laplace_encode(enc, x, ex, kn);
+    if (x != 0) od_ec_enc_bits(enc, y[i] < 0, 1);
+    kn -= x;
+  }
+  /* Adapting the estimates for expQ8 */
+  if (!ran_delta) {
+  }
+  curr[OD_ADAPT_K_Q8] = k - kn;
+  curr[OD_ADAPT_SUM_EX_Q8] = sum_ex;
diff --git a/av1/encoder/pvq_encoder.c b/av1/encoder/pvq_encoder.c
new file mode 100644
index 0000000..b0ee102
--- /dev/null
+++ b/av1/encoder/pvq_encoder.c
@@ -0,0 +1,1015 @@
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at
+ */
+/* clang-format off */
+# include "config.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "aom_dsp/entcode.h"
+#include "aom_dsp/entenc.h"
+#include "av1/common/blockd.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/partition.h"
+#include "av1/common/pvq_state.h"
+#include "av1/encoder/encodemb.h"
+#include "pvq_encoder.h"
+#define OD_PVQ_RATE_APPROX (0)
+/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
+   dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
+static void od_encode_pvq_codeword(od_ec_enc *ec, od_pvq_codeword_ctx *adapt,
+ const od_coeff *in, int n, int k) { /* Writes the n-dimensional vector in:
+   first the magnitudes via od_encode_band_pvq_splits() (k is passed as its
+   total), then one raw sign bit per non-zero entry. */
+  int i;
+  od_encode_band_pvq_splits(ec, adapt, in, n, k, 0);
+  for (i = 0; i < n; i++) if (in[i]) od_ec_enc_bits(ec, in[i] < 0, 1);
+/* Computes 1/sqrt(i) using a table for small values.
+   The table covers i = 1..16 (six decimal places); larger i falls back to
+   sqrt(). i must be at least 1: i = 0 would index table[-1].
+   Callers in this file pass a double expression, which is converted to
+   int by the call. */
+static double od_rsqrt_table(int i) {
+  static double table[16] = {
+    1.000000, 0.707107, 0.577350, 0.500000,
+    0.447214, 0.408248, 0.377964, 0.353553,
+    0.333333, 0.316228, 0.301511, 0.288675,
+    0.277350, 0.267261, 0.258199, 0.250000};
+  if (i <= 16) return table[i-1];
+  else return 1./sqrt(i);
+/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results
+   where 0 <= i < table_size.
+  For i >= table_size the value is computed through od_rsqrt_table() instead.
+  The table is expected to have been filled for the same start value.*/
+static double od_custom_rsqrt_dynamic_table(const double* table,
+ const int table_size, const double start, const int i) {
+  if (i < table_size) return table[i];
+  else return od_rsqrt_table(start + 2*i + 1);
+/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.
+  Entry i receives od_rsqrt_table(start + 2*i + 1) for 0 <= i < table_size.*/
+static void od_fill_dynamic_rqrt_table(double *table, const int table_size,
+ const double start) {
+  int i;
+  for (i = 0; i < table_size; i++)
+    table[i] = od_rsqrt_table(start + 2*i + 1);
+/** Find the codepoint on the given PSphere closest to the desired
+ * vector. Double-precision PVQ search just to make sure our tests
+ * aren't limited by numerical accuracy.
+ *
+ * The search works on absolute values of the input and places pulses one
+ * at a time; the last 1 + k/4 pulses are placed with a rate penalty that
+ * grows with the position index. Signs of xcoeff are copied onto ypulse at
+ * the end.
+ *
+ * @param [in]      xcoeff  input vector to quantize (x in the math doc)
+ * @param [in]      n       number of dimensions
+ * @param [in]      k       number of pulses
+ * @param [in,out]  ypulse  optimal codevector found (y in the math doc);
+ *                          read on entry when prev_k is in 1..k
+ * @param [in]      g2      multiplier for the distortion (typically squared
+ *                          gain units)
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in]      prev_k  number of pulses already in ypulse that we should
+ *                          reuse for the search (or 0 for a new search)
+ * @return                  cosine distance between x and y (between 0 and 1),
+ *                          computed as xy/sqrt(xx*yy)
+ */
+static double pvq_search_rdo_double(const od_val16 *xcoeff, int n, int k,
+ od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
+  int i, j;
+  double xy;
+  double yy;
+  /* TODO - This blows our 8kB stack space budget and should be fixed when
+   converting PVQ to fixed point. */
+  double x[MAXN];
+  double xx;
+  double lambda;
+  double norm_1;
+  int rdo_pulses;
+  double delta_rate;
+  xx = xy = yy = 0;
+  for (j = 0; j < n; j++) {
+    x[j] = fabs((float)xcoeff[j]);  /* Magnitude only; note the float cast. */
+    xx += x[j]*x[j];
+  }
+  norm_1 = 1./sqrt(1e-30 + xx);
+  lambda = pvq_norm_lambda/(1e-30 + g2);
+  i = 0;  /* Number of pulses placed so far. */
+  if (prev_k > 0 && prev_k <= k) {
+    /* We reuse pulses from a previous search so we don't have to search them
+       again. */
+    for (j = 0; j < n; j++) {
+      ypulse[j] = abs(ypulse[j]);
+      xy += x[j]*ypulse[j];
+      yy += ypulse[j]*ypulse[j];
+      i += ypulse[j];
+    }
+  }
+  else if (k > 2) {
+    /* Initial guess: scale x so its L1 norm is k and round each entry down,
+       which can place at most k pulses. */
+    double l1_norm;
+    double l1_inv;
+    l1_norm = 0;
+    for (j = 0; j < n; j++) l1_norm += x[j];
+    l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
+    for (j = 0; j < n; j++) {
+      double tmp;
+      tmp = k*x[j]*l1_inv;
+      ypulse[j] = OD_MAXI(0, (int)floor(tmp));
+      xy += x[j]*ypulse[j];
+      yy += ypulse[j]*ypulse[j];
+      i += ypulse[j];
+    }
+  }
+  else OD_CLEAR(ypulse, n);
+  /* Only use RDO on the last few pulses. This not only saves CPU, but using
+     RDO on all pulses actually makes the results worse for reasons I don't
+     fully understand. */
+  rdo_pulses = 1 + k/4;
+  /* Rough assumption for now, the last position costs about 3 bits more than
+     the first. */
+  delta_rate = 3./n;
+  /* Search one pulse at a time */
+  for (; i < k - rdo_pulses; i++) {
+    int pos;
+    double best_xy;
+    double best_yy;
+    pos = 0;
+    best_xy = -10;
+    best_yy = 1;
+    for (j = 0; j < n; j++) {
+      double tmp_xy;
+      double tmp_yy;
+      tmp_xy = xy + x[j];
+      tmp_yy = yy + 2*ypulse[j] + 1;
+      tmp_xy *= tmp_xy;
+      /* Compares tmp_xy/tmp_yy against best_xy/best_yy without dividing. */
+      if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) {
+        best_xy = tmp_xy;
+        best_yy = tmp_yy;
+        pos = j;
+      }
+    }
+    xy = xy + x[pos];
+    yy = yy + 2*ypulse[pos] + 1;
+    ypulse[pos]++;
+  }
+  /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2
+     and since x^2 and y^2 are constant, we just maximize x*y, plus a
+     lambda*rate term. Note that since x and y aren't normalized here,
+     we need to divide by sqrt(x^2)*sqrt(y^2). */
+  for (; i < k; i++) {
+    double rsqrt_table[4];
+    int rsqrt_table_size = 4;
+    int pos;
+    double best_cost;
+    pos = 0;
+    best_cost = -1e5;
+    /*Fill the small rsqrt lookup table with inputs relative to yy.
+      Specifically, the table of rsqrt_table_size values is filled with
+       rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(size-1) + 1).*/
+    od_fill_dynamic_rqrt_table(rsqrt_table, rsqrt_table_size, yy);
+    for (j = 0; j < n; j++) {
+      double tmp_xy;
+      double tmp_yy;
+      tmp_xy = xy + x[j];
+      /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
+      tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
+       yy, ypulse[j]);
+      /* Normalized correlation gain minus a rate penalty linear in j. */
+      tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate;
+      if (j == 0 || tmp_xy > best_cost) {
+        best_cost = tmp_xy;
+        pos = j;
+      }
+    }
+    xy = xy + x[pos];
+    yy = yy + 2*ypulse[pos] + 1;
+    ypulse[pos]++;
+  }
+  /* Restore the signs of the input on the pulse vector. */
+  for (i = 0; i < n; i++) {
+    if (xcoeff[i] < 0) ypulse[i] = -ypulse[i];
+  }
+  return xy/(1e-100 + sqrt(xx*yy));
+/** Encodes the gain so that the return value increases with the
+ * distance |x-ref|, so that we can encode a zero when x=ref. The
+ * value x=0 is not covered because it is only allowed in the noref
+ * case.
+ *
+ * Mapping used: x below ref gives the odd values 2*(ref - x) - 1, x in
+ * [ref, 2*ref) gives the even values 2*(x - ref), and x at or above 2*ref
+ * gives x - 1.
+ *
+ * @param [in]      x      quantized gain to encode
+ * @param [in]      ref    quantized gain of the reference
+ * @return                 interleave-encoded quantized gain value
+ */
+static int neg_interleave(int x, int ref) {
+  if (x < ref) return -2*(x - ref) - 1;
+  else if (x < 2*ref) return 2*(x - ref);
+  else return x-1;
+int od_vector_is_null(const od_coeff *x, int len) { /* Returns 1 when all of
+   x[0..len-1] are zero (including len <= 0), and 0 otherwise. */
+  int i;
+  for (i = 0; i < len; i++) if (x[i]) return 0;
+  return 1;
+static double od_pvq_rate(int qg, int icgr, int theta, int ts,
+ const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n,
+ int is_keyframe, int pli, int speed) { /* Estimates the cost, in bits, of
+   coding the pulse vector y0 (k pulses), plus an approximate cost for theta
+   when qg > 0 and theta >= 0. The vector dimension used is n, reduced by
+   one when theta is not -1. adapt is only read (its codeword context is
+   copied before the trial encode). */
+  double rate;
+  if (k == 0) rate = 0;  /* No pulses: the codeword itself costs nothing. */
+  else if (speed > 0) {
+    int i;
+    int sum;
+    double f;
+    /* Compute "center of mass" of the pulse vector. */
+    sum = 0;
+    for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
+    f = sum/(double)(k*n);
+    /* Estimates the number of bits it will cost to encode K pulses in
+       N dimensions based on hand-tuned fit for bitrate vs K, N and
+       "center of mass". */
+    rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
+  }
+  else {
+    /* Exact path: encode the codeword into a scratch encoder with a copy of
+       the adaptation context and measure how far the encoder advanced. */
+    od_ec_enc ec;
+    od_pvq_codeword_ctx cd;
+    int tell;
+    od_ec_enc_init(&ec, 1000);
+    OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1);
+    tell = od_ec_enc_tell_frac(&ec);
+    od_encode_pvq_codeword(&ec, &cd, y0, n - (theta != -1), k);
+    rate = (od_ec_enc_tell_frac(&ec)-tell)/8.;
+    od_ec_enc_clear(&ec);
+  }
+  if (qg > 0 && theta >= 0) {
+    /* Approximate cost of entropy-coding theta */
+    rate += .9*OD_LOG2(ts);
+    /* Adding a cost to using the H/V pred because it's going to be off
+       most of the time. Cost is optimized on subset1, while making
+       sure we don't hurt the checkerboard image too much.
+       FIXME: Do real RDO instead of this arbitrary cost. */
+    if (is_keyframe && pli == 0) rate += 6;
+    if (qg == icgr) rate -= .5;  /* Small bonus when qg equals icgr. */
+  }
+  return rate;
+#define MAX_PVQ_ITEMS (20)
+/* This stores the information about a PVQ search candidate, so we can sort
+   based on K.
+   NOTE(review): the code that fills these fields is not next to this
+   definition; the field names match locals of pvq_theta() (qcg, ts, qtheta,
+   k) -- confirm the exact meaning of each against that function. */
+typedef struct {
+  int gain;
+  int k;  /* Sort key used by items_compare(). */
+  od_val32 qtheta;
+  int theta;
+  int ts;
+  od_val32 qcg;
+} pvq_search_item;
+int items_compare(pvq_search_item *a, pvq_search_item *b) { /* Orders
+   candidates by ascending k: negative when a->k < b->k, zero when equal,
+   positive otherwise.
+   NOTE(review): this has external linkage and does not take const void *
+   parameters, so passing it to qsort() needs a function-pointer cast;
+   confirm at the call site whether it should be static and qsort-typed. */
+  return a->k - b->k;
+/** Perform PVQ quantization with prediction, trying several
+ * possible gains and angles. See draft-valin-videocodec-pvq for more
+ * details.
+ *
+ * @param [out]    out       coefficients after quantization
+ * @param [in]     x0        coefficients before quantization
+ * @param [in]     r0        reference, aka predicted coefficients
+ * @param [in]     n         number of dimensions
+ * @param [in]     q0        quantization step size
+ * @param [out]    y         pulse vector (i.e. selected PVQ codevector)
+ * @param [out]    itheta    angle between input and reference (-1 if noref)
+ * @param [out]    max_theta maximum value that itheta could have taken
+ * @param [out]    vk        total number of pulses
+ * @param [in]     beta      per-band activity masking beta param
+ * @param [out]    skip_diff distortion cost of skipping this block
+ *                           (accumulated)
+ * @param [in]     robust    make stream robust to error in the reference
+ * @param [in]     is_keyframe whether we're encoding a keyframe
+ * @param [in]     pli       plane index
+ * @param [in]     adapt     probability adaptation context
+ * @param [in]     qm        QM with magnitude compensation
+ * @param [in]     qm_inv    Inverse of QM with magnitude compensation
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @param [in]     speed     Make search faster by making approximations
+ * @return         gain      index of the quantized gain
+static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
+ int n, int q0, od_coeff *y, int *itheta, int *max_theta, int *vk,
+ od_val16 beta, double *skip_diff, int robust, int is_keyframe, int pli,
+ const od_adapt_ctx *adapt, const int16_t *qm,
+ const int16_t *qm_inv, double pvq_norm_lambda, int speed) {
+  od_val32 g;
+  od_val32 gr;
+  od_coeff y_tmp[MAXN];
+  int i;
+  /* Number of pulses. */
+  int k;
+  /* Companded gain of x and reference, normalized to q. */
+  od_val32 cg;
+  od_val32 cgr;
+  int icgr;
+  int qg;
+  /* Best RDO cost (D + lambda*R) so far. */
+  double best_cost;
+  double dist0;
+  /* Distortion (D) that corresponds to the best RDO cost. */
+  double best_dist;
+  double dist;
+  /* Sign of Householder reflection. */
+  int s;
+  /* Dimension on which Householder reflects. */
+  int m;
+  od_val32 theta;
+  double corr;
+  int best_k;
+  od_val32 best_qtheta;
+  od_val32 gain_offset;
+  int noref;
+  double skip_dist;
+  int cfl_enabled;
+  int skip;
+  double gain_weight;
+  od_val16 x16[MAXN];
+  od_val16 r16[MAXN];
+  int xshift;
+  int rshift;
+  /* Give more weight to gain error when calculating the total distortion. */
+  gain_weight = 1.0;
+  OD_ASSERT(n > 1);
+  corr = 0;
+#if !defined(OD_FLOAT_PVQ)
+  /* Shift needed to make x fit in 16 bits even after rotation.
+     This shift value is not normative (it can be changed without breaking
+     the bitstream) */
+  xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
+  /* Shift needed to make the reference fit in 15 bits, so that the Householder
+     vector can fit in 16 bits.
+     This shift value *is* normative, and has to match the decoder. */
+  rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
+  xshift = 0;
+  rshift = 0;
+  for (i = 0; i < n; i++) {
+#if defined(OD_FLOAT_PVQ)
+    /*This is slightly different from the original float PVQ code,
+       where the qm was applied in the accumulation in od_pvq_compute_gain and
+       the vectors were od_coeffs, not od_val16 (i.e. double).*/
+    x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
+    r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
+    x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
+    r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
+    corr += OD_MULT16_16(x16[i], r16[i]);
+  }
+  cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
+  cg  = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
+  cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
+  if (cfl_enabled) cgr = OD_CGAIN_SCALE;
+  /* gain_offset is meant to make sure one of the quantized gains has
+     exactly the same gain as the reference. */
+#if defined(OD_FLOAT_PVQ)
+  icgr = (int)floor(.5 + cgr);
+  icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
+  gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
+  /* Start search with null case: gain=0, no pulse. */
+  qg = 0;
+  dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
+  best_dist = dist;
+  best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
+   n, is_keyframe, pli, speed);
+  noref = 1;
+  best_k = 0;
+  *itheta = -1;
+  *max_theta = 0;
+  OD_CLEAR(y, n);
+  best_qtheta = 0;
+  m = 0;
+  s = 1;
+  corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
+  corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
+  if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
+  else {
+    skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
+     + cgr*(double)cg*(2 - 2*corr);
+    skip_dist *= OD_CGAIN_SCALE_2;
+  }
+  if (!is_keyframe) {
+    /* noref, gain=0 isn't allowed, but skip is allowed. */
+    od_val32 scgr;
+    scgr = OD_MAXF(0,gain_offset);
+    if (icgr == 0) {
+      best_dist = gain_weight*(cg - scgr)*(cg - scgr)
+       + scgr*(double)cg*(2 - 2*corr);
+      best_dist *= OD_CGAIN_SCALE_2;
+    }
+    best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
+     NULL, 0, n, is_keyframe, pli, speed);
+    best_qtheta = 0;
+    *itheta = 0;
+    *max_theta = 0;
+    noref = 0;
+  }
+  dist0 = best_dist;
+  if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
+    od_val16 xr[MAXN];
+    int gain_bound;
+    int prev_k;
+    pvq_search_item items[MAX_PVQ_ITEMS];
+    int idx;
+    int nitems;
+    double cos_dist;
+    idx = 0;
+    gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
+    /* Perform theta search only if prediction is useful. */
+    theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
+    m = od_compute_householder(r16, n, gr, &s, rshift);
+    od_apply_householder(xr, x16, r16, n);
+    prev_k = 0;
+    for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
+    /* Compute all candidate PVQ searches within a reasonable range of gain
+       and theta. */
+    for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
+      int j;
+      od_val32 qcg;
+      int ts;
+      int theta_lower;
+      int theta_upper;
+      /* Quantized companded gain */
+      qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
+      /* Set angular resolution (in ra) to match the encoded gain */
+      ts = od_pvq_compute_max_theta(qcg, beta);
+      theta_lower = OD_MAXI(0, (int)floor(.5 +
+       theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
+      theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
+      /* Include the angles within a reasonable range. */
+      for (j = theta_lower; j <= theta_upper; j++) {
+        od_val32 qtheta;
+        qtheta = od_pvq_compute_theta(j, ts);
+        k = od_pvq_compute_k(qcg, j, qtheta, 0, n, beta, robust || is_keyframe);
+        items[idx].gain = i;
+        items[idx].theta = j;
+        items[idx].k = k;
+        items[idx].qcg = qcg;
+        items[idx].qtheta = qtheta;
+        items[idx].ts = ts;
+        idx++;
+        OD_ASSERT(idx < MAX_PVQ_ITEMS);
+      }
+    }
+    nitems = idx;
+    cos_dist = 0;
+    /* Sort PVQ search candidates in ascending order of pulses K so that
+       we can reuse all the previously searched pulses across searches. */
+    qsort(items, nitems, sizeof(items[0]),
+     (int (*)(const void *, const void *))items_compare);
+    /* Search for the best gain/theta in order. */
+    for (idx = 0; idx < nitems; idx++) {
+      int j;
+      od_val32 qcg;
+      int ts;
+      double cost;
+      double dist_theta;
+      double sin_prod;
+      od_val32 qtheta;
+      /* Quantized companded gain */
+      qcg = items[idx].qcg;
+      i = items[idx].gain;
+      j = items[idx].theta;
+      /* Set angular resolution (in ra) to match the encoded gain */
+      ts = items[idx].ts;
+      /* Search for the best angle within a reasonable range. */
+      qtheta = items[idx].qtheta;
+      k = items[idx].k;
+      /* Compute the minimal possible distortion by not taking the PVQ
+         cos_dist into account. */
+      dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
+      dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
+      dist *= OD_CGAIN_SCALE_2;
+      /* If we have no hope of beating skip (including a 1-bit worst-case
+         penalty), stop now. */
+      if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
+      sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
+       OD_TRIG_SCALE_1;
+      /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
+         that's the factor by which cos_dist is multiplied to get the
+         distortion metric. */
+      if (k == 0) {
+        cos_dist = 0;
+        OD_CLEAR(y_tmp, n-1);
+      }
+      else if (k != prev_k) {
+        cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
+         qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
+      }
+      prev_k = k;
+      /* See Jmspeex' Journal of Dubious Theoretical Results. */
+      dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
+       + sin_prod*(2 - 2*cos_dist);
+      dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
+      dist *= OD_CGAIN_SCALE_2;
+      /* Do approximate RDO. */
+      cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
+       k, n, is_keyframe, pli, speed);
+      if (cost < best_cost) {
+        best_cost = cost;
+        best_dist = dist;
+        qg = i;
+        best_k = k;
+        best_qtheta = qtheta;
+        *itheta = j;
+        *max_theta = ts;
+        noref = 0;
+        OD_COPY(y, y_tmp, n - 1);
+      }
+    }
+  }
+  /* Don't bother with no-reference version if there's a reasonable
+     correlation. The only exception is luma on a keyframe because
+     H/V prediction is unreliable. */
+  if (n <= OD_MAX_PVQ_SIZE &&
+   ((is_keyframe && pli == 0) || corr < .5
+   || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
+    int gain_bound;
+    int prev_k;
+    gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
+    prev_k = 0;
+    /* Search for the best gain (haven't determined reasonable range yet). */
+    for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
+      double cos_dist;
+      double cost;
+      od_val32 qcg;
+      qcg = OD_SHL(i, OD_CGAIN_SHIFT);
+      k = od_pvq_compute_k(qcg, -1, -1, 1, n, beta, robust || is_keyframe);
+      /* Compute the minimal possible distortion by not taking the PVQ
+         cos_dist into account. */
+      dist = gain_weight*(qcg - cg)*(qcg - cg);
+      dist *= OD_CGAIN_SCALE_2;
+      if (dist > dist0 && k != 0) continue;
+      cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
+       qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
+      prev_k = k;
+      /* See Jmspeex' Journal of Dubious Theoretical Results. */
+      dist = gain_weight*(qcg - cg)*(qcg - cg)
+       + qcg*(double)cg*(2 - 2*cos_dist);
+      dist *= OD_CGAIN_SCALE_2;
+      /* Do approximate RDO. */
+      cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
+       n, is_keyframe, pli, speed);
+      if (cost <= best_cost) {
+        best_cost = cost;
+        best_dist = dist;
+        qg = i;
+        noref = 1;
+        best_k = k;
+        *itheta = -1;
+        *max_theta = 0;
+        OD_COPY(y, y_tmp, n);
+      }
+    }
+  }
+  k = best_k;
+  theta = best_qtheta;
+  skip = 0;
+  if (noref) {
+    if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
+  }
+  else {
+    if (!is_keyframe && qg == 0) {
+      skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
+    }
+    if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
+  }
+  /* Synthesize like the decoder would. */
+  if (skip) {
+    if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
+    else OD_CLEAR(out, n);
+  }
+  else {
+    if (noref) gain_offset = 0;
+    g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
+    od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
+     qm_inv);
+  }
+  *vk = k;
+  *skip_diff += skip_dist - best_dist;
+  /* Encode gain differently depending on whether we use prediction or not.
+     Special encoding on inter frames where qg=0 is allowed for noref=0
+     but not noref=1.*/
+  if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
+  else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
+/** Encodes a single vector of integers (eg, a partition within a
+ *  coefficient block) using PVQ
+ *
+ * The gain, theta and noref decisions are first coded jointly as one small
+ * symbol; the optional CfL flip bit, the remainder of the gain, the remainder
+ * of theta and finally the pulse vector follow, in that order.
+ *
+ * @param [in,out] ec         range encoder
+ * @param [in]     qg         quantized gain
+ * @param [in]     theta      quantized post-prediction theta
+ *                            (-1 means no reference is used)
+ * @param [in]     max_theta  maximum possible quantized theta value
+ * @param [in]     in         coefficient vector to code
+ * @param [in]     n          number of coefficients in partition
+ * @param [in]     k          number of pulses in partition
+ * @param [in,out] model      entropy encoder state
+ * @param [in,out] adapt      adaptation context
+ * @param [in,out] exg        ExQ16 expectation of gain value
+ * @param [in,out] ext        ExQ16 expectation of theta value
+ * @param [in]     nodesync   do not use info that depend on the reference
+ * @param [in]     cdf_ctx    selects which cdf context to use
+ * @param [in]     is_keyframe whether we're encoding a keyframe
+ * @param [in]     code_skip  whether the "skip rest" flag is allowed
+ * @param [in]     skip_rest  when set, we skip all higher bands
+ * @param [in]     encode_flip whether we need to encode the CfL flip flag now
+ * @param [in]     flip       value of the CfL flip flag
+ */
+void pvq_encode_partition(od_ec_enc *ec,
+                                 int qg,
+                                 int theta,
+                                 int max_theta,
+                                 const od_coeff *in,
+                                 int n,
+                                 int k,
+                                 generic_encoder model[3],
+                                 od_adapt_ctx *adapt,
+                                 int *exg,
+                                 int *ext,
+                                 int nodesync,
+                                 int cdf_ctx,
+                                 int is_keyframe,
+                                 int code_skip,
+                                 int skip_rest,
+                                 int encode_flip,
+                                 int flip) {
+  int noref;
+  int id;
+  noref = (theta == -1);
+  /* Joint symbol: bit 0 says whether the gain is non-zero, theta + 1 (clamped
+     to 3) is stored times two, and 8 is added when the "skip rest" flag is
+     both allowed and set. */
+  id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest;
+  /* One value of the joint symbol is asserted never to occur (8 on keyframes,
+     10 otherwise); every larger value is moved down by one to close the
+     gap. */
+  if (is_keyframe) {
+    OD_ASSERT(id != 8);
+    if (id >= 8) id--;
+  }
+  else {
+    OD_ASSERT(id != 10);
+    if (id >= 10) id--;
+  }
+  /* Jointly code gain, theta and noref for small values. Then we handle
+     larger gain and theta values. For noref, theta = -1. */
+  od_encode_cdf_adapt(ec, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
+   8 + 7*code_skip, adapt->pvq.pvq_gaintheta_increment);
+  if (encode_flip) {
+    /* We could eventually do some smarter entropy coding here, but it would
+       have to be good enough to overcome the overhead of the entropy coder.
+       An early attempt using a "toggle" flag with simple adaptation wasn't
+       worth the trouble. */
+    od_ec_enc_bits(ec, flip, 1);
+  }
+  if (qg > 0) {
+    int tmp;
+    /* generic_encode() is handed a copy of the expectation; *exg itself is
+       only updated by the OD_IIR_DIADIC() below. */
+    tmp = *exg;
+    generic_encode(ec, &model[!noref], qg - 1, -1, &tmp, 2);
+    OD_IIR_DIADIC(*exg, qg << 16, 2);
+  }
+  /* theta values 0 and 1 are fully described by the joint symbol; larger
+     ones are coded as theta - 2 unless max_theta leaves no other choice. */
+  if (theta > 1 && (nodesync || max_theta > 3)) {
+    int tmp;
+    tmp = *ext;
+    generic_encode(ec, &model[2], theta - 2, nodesync ? -1 : max_theta - 3,
+     &tmp, 2);
+    OD_IIR_DIADIC(*ext, theta << 16, 2);
+  }
+  /* With a reference (theta != -1) the coded vector has one dimension
+     less. */
+  od_encode_pvq_codeword(ec, &adapt->pvq.pvq_codeword_ctx, in,
+   n - (theta != -1), k);
+}
+/** Quantizes a scalar with rate-distortion optimization (RDO)
+ * @param [in] x      unquantized value
+ * @param [in] q      quantization step size
+ * @param [in] delta0 rate increase for encoding a 1 instead of a 0
+ * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
+ * @retval quantized value (0 when the RDO threshold is not reached, or when
+ *         |x| already rounds to zero)
+ */
+int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
+  int n;
+  /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
+     Jmspeex' Journal of Dubious Theoretical Results for details. */
+  n = OD_DIV_R0(abs(x), q);
+  /* A magnitude that already rounds to zero needs no RDO decision. Returning
+     here also keeps the threshold below from dividing by 2*n == 0.
+     NOTE(review): assumes OD_DIV_R0() rounds symmetrically, so that
+     OD_DIV_R0(x, q) is 0 whenever OD_DIV_R0(abs(x), q) is 0 -- confirm. */
+  if (n == 0) return 0;
+  if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) {
+    return 0;
+  }
+  else {
+    return OD_DIV_R0(x, q);
+  }
+}
+/** Stores a quantizer scaling value in enc->state.sb_q_scaling and, unless
+ *  skip is set, codes it with a context built from the above and left
+ *  neighbours' stored values.
+ *
+ * @param [in,out] enc       daala encoder context
+ * @param [in]     q_scaling quantizer scaling value to store and code
+ * @param [in]     sbx       column index into sb_q_scaling
+ *                           (asserted to be below enc->state.nhsb)
+ * @param [in]     sby       row index into sb_q_scaling
+ *                           (asserted to be below enc->state.nvsb)
+ * @param [in]     skip      0 or 1; when 1, q_scaling must be 0 and nothing
+ *                           is written to the entropy coder
+ */
+void od_encode_quantizer_scaling(daala_enc_ctx *enc, int q_scaling,
+ int sbx, int sby, int skip) {
+  int nhsb;
+  OD_ASSERT(skip == !!skip);
+  nhsb = enc->state.nhsb;
+  OD_ASSERT(sbx < nhsb);
+  OD_ASSERT(sby < enc->state.nvsb);
+  OD_ASSERT(!skip || q_scaling == 0);
+  /* The value is recorded even when skipping so that later neighbours can
+     use it as context. */
+  enc->state.sb_q_scaling[sby*nhsb + sbx] = q_scaling;
+  if (!skip) {
+    int above;
+    int left;
+    /* use value from neighbour if possible, otherwise use 0 */
+    above = sby > 0 ? enc->state.sb_q_scaling[(sby - 1)*nhsb + sbx] : 0;
+    left = sbx > 0 ? enc->state.sb_q_scaling[sby*nhsb + (sbx - 1)] : 0;
+    od_encode_cdf_adapt(&enc->ec, q_scaling,
+     enc->state.adapt.q_cdf[above + left*4], 4,
+     enc->state.adapt.q_increment);
+  }
+}
+/** Encode a coefficient block (excepting DC) using PVQ
+ *
+ * Every band is first quantized with pvq_theta(). The block is then coded as
+ * if the AC were not skipped, the cost of that is measured, and the encoder
+ * state is rolled back to code a skip instead when the distortion saved does
+ * not justify the rate.
+ *
+ * @param [in,out] enc     daala encoder context
+ * @param [in]     ref     'reference' (prediction) vector; negated in place
+ *                         when the CfL flip is selected
+ * @param [in]     in      coefficient block to quantize and encode
+ * @param [out]    out     quantized coefficient block
+ * @param [in]     q_dc    quantizer for the DC coefficient (clamped to >= 1)
+ * @param [in]     q_ac    quantizer handed to pvq_theta() for every band
+ *                         (clamped to >= 1)
+ * @param [in]     pli     plane index
+ * @param [in]     bs      log of the block size minus two
+ * @param [in]     beta    per-band activity masking beta param
+ * @param [in]     robust  make stream robust to error in the reference
+ * @param [in]     is_keyframe whether we're encoding a keyframe
+ * @param [in]     q_scaling scaling factor to apply to quantizer
+ * @param [in]     bx      x-coordinate of this block
+ * @param [in]     by      y-coordinate of this block
+ * @param [in]     qm      QM with magnitude compensation
+ * @param [in]     qm_inv  Inverse of QM with magnitude compensation
+ * @param [in]     speed   Make search faster by making approximations
+ * @param [in]     pvq_info If null, considered as RDO search mode
+ * @return         Returns 1 if both DC and AC coefficients are skipped,
+ *                 zero otherwise
+ */
+int od_pvq_encode(daala_enc_ctx *enc,
+                   od_coeff *ref,
+                   const od_coeff *in,
+                   od_coeff *out,
+                   int q_dc,
+                   int q_ac,
+                   int pli,
+                   int bs,
+                   const od_val16 *beta,
+                   int robust,
+                   int is_keyframe,
+                   int q_scaling,
+                   int bx,
+                   int by,
+                   const int16_t *qm,
+                   const int16_t *qm_inv,
+                   int speed,
+                   PVQ_INFO *pvq_info){
+  /* NOTE(review): the per-band arrays qg[] and k[] are used below but their
+     declarations are not present in this patch text -- confirm against the
+     upstream file. */
+  int theta[PVQ_MAX_PARTITIONS];
+  int max_theta[PVQ_MAX_PARTITIONS];
+  od_coeff y[OD_BSIZE_MAX*OD_BSIZE_MAX];
+  int *exg;
+  int *ext;
+  int nb_bands;
+  int i;
+  const int *off;
+  int size[PVQ_MAX_PARTITIONS];
+  generic_encoder *model;
+  double skip_diff;
+  int tell;
+  uint16_t *skip_cdf;
+  od_rollback_buffer buf;
+  int dc_quant;
+  int flip;
+  int cfl_encoded;
+  int skip_rest;
+  int skip_dir;
+  int skip_theta_value;
+  /* const unsigned char *pvq_qm; */
+  double dc_rate;
+  /* NOTE(review): q_scaling, bx and by are marked unused here, yet they are
+     passed to od_encode_quantizer_scaling() further down. */
+  OD_UNUSED(q_scaling);
+  OD_UNUSED(bx);
+  OD_UNUSED(by);
+  /* TODO(yushin): Enable this for activity masking,
+     when pvq_qm_q4 is available in AOM. */
+  /* pvq_qm = &enc->state.pvq_qm_q4[pli][0]; */
+  exg = &enc->state.adapt.pvq.pvq_exg[pli][bs][0];
+  ext = enc->state.adapt.pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
+  skip_cdf = enc->state.adapt.skip_cdf[2*bs + (pli != 0)];
+  model = enc->state.adapt.pvq.pvq_param_model;
+  /* Entry 0 of the band table holds the band count; the band boundaries
+     follow it. */
+  nb_bands = OD_BAND_OFFSETS[bs][0];
+  off = &OD_BAND_OFFSETS[bs][1];
+  /*dc_quant = OD_MAXI(1, q0*pvq_qm[od_qm_get_index(bs, 0)] >> 4);*/
+  dc_quant = OD_MAXI(1, q_dc);
+  tell = 0;
+  for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
+  skip_diff = 0;
+  flip = 0;
+  /*If we are coding a chroma block of a keyframe, we are doing CfL.*/
+  if (pli != 0 && is_keyframe) {
+    od_val32 xy;
+    xy = 0;
+    /*Compute the dot-product of the first band of chroma with the luma ref.*/
+    for (i = off[0]; i < off[1]; i++) {
+      /* NOTE(review): no #else / #endif is visible for the #if below, so the
+         float and fixed-point variants appear back to back in this text --
+         confirm against the upstream file. */
+#if defined(OD_FLOAT_PVQ)
+      xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
+       (double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
+      od_val32 rq;
+      od_val32 inq;
+      rq = ref[i]*qm[i];
+      inq = in[i]*qm[i];
+      xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
+       1));
+    }
+    /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
+    if (xy < 0) {
+      flip = 1;
+      for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
+    }
+  }
+  /* Quantize every band. pvq_theta() returns the value stored in qg[i] and
+     fills theta[i], max_theta[i], k[i] and the pulse vector in y; skip_diff
+     is accumulated across the bands. */
+  for (i = 0; i < nb_bands; i++) {
+    int q;
+    /* TODO(yushin): Enable this for activity masking,
+       when pvq_qm_q4 is available in AOM. */
+    /*q = OD_MAXI(1, q0*pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);*/
+    q = OD_MAXI(1, q_ac);
+    qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
+     q, y + off[i], &theta[i], &max_theta[i],
+     &k[i], beta[i], &skip_diff, robust, is_keyframe, pli, &enc->state.adapt,
+     qm + off[i], qm_inv + off[i], enc->pvq_norm_lambda, speed);
+  }
+  /* Remember the encoder state so that the trial coding below can be undone
+     if skipping turns out to be cheaper. */
+  od_encode_checkpoint(enc, &buf);
+  if (is_keyframe) out[0] = 0;
+  else {
+    int n;
+    n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
+    if (n == 0) {
+      out[0] = 0;
+    /* NOTE(review): as written, the pli == 0 branch is empty and leaves
+       out[0] untouched; preprocessor lines may be missing here -- confirm
+       against the upstream file. */
+    } else if (pli == 0) {
+    } else {
+      int tell2;
+      od_rollback_buffer dc_buf;
+      dc_rate = -OD_LOG2((double)(skip_cdf[3] - skip_cdf[2])/
+       (double)(skip_cdf[2] - skip_cdf[1]));
+      dc_rate += 1;
+      /* Trial-code the DC magnitude to measure its rate, then undo it. */
+      tell2 = od_ec_enc_tell_frac(&enc->ec);
+      od_encode_checkpoint(enc, &dc_buf);
+      generic_encode(&enc->ec, &enc->state.adapt.model_dc[pli],
+       n - 1, -1, &enc->state.adapt.ex_dc[pli][bs][0], 2);
+      tell2 = od_ec_enc_tell_frac(&enc->ec) - tell2;
+      dc_rate += tell2/8.0;
+      od_encode_rollback(enc, &dc_buf);
+      out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
+       enc->pvq_norm_lambda);
+    }
+  }
+  tell = od_ec_enc_tell_frac(&enc->ec);
+  /* Code as if we're not skipping. */
+  /* Symbols 2 and 3 are used on this path (3 when DC is non-zero); the skip
+     path further down uses 0 and 1. */
+  od_encode_cdf_adapt(&enc->ec, 2 + (out[0] != 0), skip_cdf,
+   4, enc->state.adapt.skip_increment);
+  if (pvq_info)
+    pvq_info->ac_dc_coded = 2 + (out[0] != 0);
+  if (bs == OD_NBSIZES - 1 && pli == 0) {
+    od_encode_quantizer_scaling(enc, q_scaling, bx >> (OD_NBSIZES - 1),
+     by >> (OD_NBSIZES - 1), 0);
+  }
+  cfl_encoded = 0;
+  /* skip_rest stays 1 only if every band after the first has the skip theta
+     value (-1 on keyframes, 0 otherwise) and a zero qg. */
+  skip_rest = 1;
+  skip_theta_value = is_keyframe ? -1 : 0;
+  for (i = 1; i < nb_bands; i++) {
+    if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
+  }
+  /* skip_dir is a 3-bit mask: bit i is set when all of bands i + 1, i + 4,
+     i + 7, ... satisfy the same skip condition. */
+  skip_dir = 0;
+  if (nb_bands > 1) {
+    for (i = 0; i < 3; i++) {
+      int j;
+      int tmp;
+      tmp = 1;
+      for (j = i + 1; j < nb_bands; j += 3) {
+        if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
+      }
+      skip_dir |= tmp << i;
+    }
+  }
+  /* If the first band is skippable as well, nothing is left to code; setting
+     nb_bands to 0 forces the skip path below. */
+  if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
+  /* NOTE: There was no other better place to put this function. */
+  if (pvq_info)
+    av1_store_pvq_enc_info(pvq_info, qg, theta, max_theta, k,
+      y, nb_bands, off, size,
+      skip_rest, skip_dir, bs);
+  for (i = 0; i < nb_bands; i++) {
+    int encode_flip;
+    /* Encode CFL flip bit just after the first time it's used. */
+    encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
+    /* Band 0 is always coded; a later band is coded only when neither
+       skip_rest nor its skip_dir bit, (i - 1)%3, is set. */
+    if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
+      pvq_encode_partition(&enc->ec, qg[i], theta[i], max_theta[i], y + off[i],
+       size[i], k[i], model, &enc->state.adapt, exg + i, ext + i,
+       robust || is_keyframe, (pli != 0)*OD_NBSIZES*PVQ_MAX_PARTITIONS
+       + bs*PVQ_MAX_PARTITIONS + i, is_keyframe, i == 0 && (i < nb_bands - 1),
+       skip_rest, encode_flip, flip);
+    }
+    /* The direction mask is sent once, right after the first band. */
+    if (i == 0 && !skip_rest && bs > 0) {
+      od_encode_cdf_adapt(&enc->ec, skip_dir,
+       &enc->state.adapt.pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7,
+       enc->state.adapt.pvq.pvq_skip_dir_increment);
+    }
+    if (encode_flip) cfl_encoded = 1;
+  }
+  /* Rate spent by the trial coding since the skip symbol; presumably in
+     1/8-bit units, given the /8 scaling applied where it is used. */
+  tell = od_ec_enc_tell_frac(&enc->ec) - tell;
+  /* Account for the rate of skipping the AC, based on the same DC decision
+     we made when trying to not skip AC. */
+  {
+    double skip_rate;
+    if (out[0] != 0) {
+      skip_rate = -OD_LOG2((skip_cdf[1] - skip_cdf[0])/
+     (double)skip_cdf[3]);
+    }
+    else {
+      skip_rate = -OD_LOG2(skip_cdf[0]/
+     (double)skip_cdf[3]);
+    }
+    tell -= (int)floor(.5+8*skip_rate);
+  }
+  /* Skip the AC when there is nothing to code or when the distortion saved
+     by coding it does not outweigh lambda times the extra rate. */
+  if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
+    if (is_keyframe) out[0] = 0;
+    else {
+      int n;
+      n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
+      if (n == 0) {
+        out[0] = 0;
+      /* NOTE(review): same empty pli == 0 branch as in the non-skip path
+         above -- confirm against the upstream file. */
+      } else if (pli == 0) {
+      } else {
+        int tell2;
+        od_rollback_buffer dc_buf;
+        dc_rate = -OD_LOG2((double)(skip_cdf[1] - skip_cdf[0])/
+         (double)skip_cdf[0]);
+        dc_rate += 1;
+        tell2 = od_ec_enc_tell_frac(&enc->ec);
+        od_encode_checkpoint(enc, &dc_buf);
+        generic_encode(&enc->ec, &enc->state.adapt.model_dc[pli],
+         n - 1, -1, &enc->state.adapt.ex_dc[pli][bs][0], 2);
+        tell2 = od_ec_enc_tell_frac(&enc->ec) - tell2;
+        dc_rate += tell2/8.0;
+        od_encode_rollback(enc, &dc_buf);
+        out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
+         enc->pvq_norm_lambda);
+      }
+    }
+    /* We decide to skip, roll back everything as it was before. */
+    od_encode_rollback(enc, &buf);
+    od_encode_cdf_adapt(&enc->ec, out[0] != 0, skip_cdf,
+     4, enc->state.adapt.skip_increment);
+    if (pvq_info)
+      pvq_info->ac_dc_coded = (out[0] != 0);
+    if (bs == OD_NBSIZES - 1 && pli == 0) {
+      int skip;
+      skip = out[0] == 0;
+      if (skip) {
+        q_scaling = 0;
+      }
+      od_encode_quantizer_scaling(enc, q_scaling, bx >> (OD_NBSIZES - 1),
+       by >> (OD_NBSIZES - 1), skip);
+    }
+    /* Skipped AC: zero on keyframes, a copy of the reference otherwise, for
+       indices 1 up to 1 << (2*bs + 4). */
+    if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
+    else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
+    if (out[0] == 0) return 1;
+  }
+  return 0;
+  /* NOTE(review): the closing brace of this function is not present in this
+     patch text -- confirm against the upstream file. */
diff --git a/av1/encoder/pvq_encoder.h b/av1/encoder/pvq_encoder.h
new file mode 100644
index 0000000..6cf1c3b
--- /dev/null
+++ b/av1/encoder/pvq_encoder.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+/* clang-format off */
+
+/* Declarations for the PVQ encoder entry points. */
+#if !defined(_pvq_encoder_H)
+# define _pvq_encoder_H (1)
+# include "aom_dsp/entenc.h"
+# include "av1/common/blockd.h"
+# include "av1/common/pvq.h"
+# include "av1/encoder/encint.h"
+
+#define PVQ_CHROMA_RD 1
+
+void od_encode_band_pvq_splits(od_ec_enc *ec, od_pvq_codeword_ctx *adapt,
+ const int *y, int n, int k, int level);
+
+void od_laplace_encode_special(od_ec_enc *enc, int x, unsigned decay, int max);
+void od_laplace_encode(od_ec_enc *enc, int x, int ex_q8, int k);
+void od_laplace_encode_vector(od_ec_enc *enc, const od_coeff *y, int n, int k,
+                                  int32_t *curr, const int32_t *means);
+
+/* Parameter names match the definition in pvq_encoder.c: sbx and sby index
+   enc->state.sb_q_scaling. */
+void od_encode_quantizer_scaling(daala_enc_ctx *enc, int q_scaling, int sbx,
+ int sby, int skip);
+
+void pvq_encode_partition(od_ec_enc *ec,
+                                 int qg,
+                                 int theta,
+                                 int max_theta,
+                                 const od_coeff *in,
+                                 int n,
+                                 int k,
+                                 generic_encoder model[3],
+                                 od_adapt_ctx *adapt,
+                                 int *exg,
+                                 int *ext,
+                                 int nodesync,
+                                 int cdf_ctx,
+                                 int is_keyframe,
+                                 int code_skip,
+                                 int skip_rest,
+                                 int encode_flip,
+                                 int flip);
+
+/* Returns 1 when both DC and AC end up skipped, 0 otherwise. */
+int od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref, const od_coeff *in,
+ od_coeff *out, int q_dc, int q_ac, int pli, int bs, const od_val16 *beta, int robust,
+ int is_keyframe, int q_scaling, int bx, int by, const int16_t *qm,
+ const int16_t *qm_inv, int speed, PVQ_INFO *pvq_info);
+
+#endif
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 0b9c91e..92ef15a 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -48,6 +48,10 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/pvq_encoder.h"
 #define LAST_FRAME_MODE_MASK                                      \
@@ -302,6 +306,33 @@
   *out_dist_sum = dist_sum << 4;
+// Without PVQ, av1_block_error_c() returns two kinds of errors:
+// 1) reconstruction (i.e. decoded) error and
+// 2) squared sum of the transformed residue (i.e. 'coeff').
+// However, if PVQ is enabled, coeff does not keep the transformed residue
+// but instead a transformed original is kept.
+// Hence, a new parameter, the ref vector (i.e. transformed predicted signal),
+// is required to derive the residue signal,
+// i.e. coeff - ref = residue (all transformed).
+//
+// Returns the error between coeff and dqcoeff, and stores the error between
+// coeff and ref in *ssz; both are computed by av1_block_error_fp() over
+// block_size entries.
+// TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into
+// a separate function that does not do the extra computations for ssz.
+int64_t av1_block_error2_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                           const tran_low_t *ref, intptr_t block_size,
+                           int64_t *ssz) {
+  int64_t error;
+
+  // Use the existing sse codes for calculating distortion of decoded signal:
+  // i.e. (orig - decoded)^2
+  error = av1_block_error_fp(coeff, dqcoeff, block_size);
+  // prediction residue^2 = (orig - ref)^2
+  *ssz = av1_block_error_fp(coeff, ref, block_size);
+
+  return error;
+}
 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -353,6 +384,7 @@
 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
@@ -455,6 +487,7 @@
   return cost;
 static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
                        int64_t *out_dist, int64_t *out_sse) {
@@ -466,11 +499,18 @@
   int shift = tx_size == TX_32X32 ? 0 : 2;
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
   const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
   *out_dist =
       av1_highbd_block_error(coeff, dqcoeff, ss_txfrm_size, &this_sse, bd) >>
+  *out_dist =
+      av1_block_error2_c(coeff, dqcoeff, ref_coeff, ss_txfrm_size, &this_sse) >>
+      shift;
   *out_dist =
       av1_block_error(coeff, dqcoeff, ss_txfrm_size, &this_sse) >> shift;
@@ -478,12 +518,14 @@
   *out_sse = this_sse >> shift;
 static int rate_block(int plane, int block, int blk_row, int blk_col,
                       TX_SIZE tx_size, struct rdcost_block_args *args) {
   return cost_coeffs(args->cm, args->x, plane, block, args->t_above + blk_col,
                      args->t_left + blk_row, tx_size, args->scan_order->scan,
                      args->scan_order->neighbors, args->use_fast_coef_costing);
 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
@@ -516,8 +558,11 @@
     args->exit_early = 1;
   rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
+  rate = x->rate;
   rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
   rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
@@ -533,8 +578,11 @@
     args->exit_early = 1;
   args->skippable &= !x->plane[plane].eobs[block];
+  args->skippable &= x->pvq_skip[plane];
 static void txfm_rd_in_plane(const AV1_COMMON *const cm, MACROBLOCK *x,
@@ -599,11 +647,22 @@
   *sse = INT64_MAX;
   mbmi->tx_size = AOMMIN(max_tx_size, largest_tx_size);
   if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
-    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+    od_rollback_buffer pre_buf, post_buf;
+    od_encode_checkpoint(&x->daala_enc, &pre_buf);
+    od_encode_checkpoint(&x->daala_enc, &post_buf);
+    for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
       mbmi->tx_type = tx_type;
       txfm_rd_in_plane(cm, x, &r, &d, &s, &psse, ref_best_rd, 0, bs,
                        mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+      od_encode_rollback(&x->daala_enc, &pre_buf);
       if (r == INT_MAX) continue;
       if (is_inter)
         r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
@@ -625,8 +684,14 @@
         *rate = r;
         *skip = s;
         *sse = psse;
+        od_encode_checkpoint(&x->daala_enc, &post_buf);
+    od_encode_rollback(&x->daala_enc, &post_buf);
   } else {
     txfm_rd_in_plane(cm, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
                      mbmi->tx_size, cpi->sf.use_fast_coef_costing);
@@ -670,6 +735,9 @@
   const int is_inter = is_inter_block(mbmi);
   const aom_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  od_rollback_buffer buf;
   assert(skip_prob > 0);
   s0 = av1_cost_bit(skip_prob, 0);
   s1 = av1_cost_bit(skip_prob, 1);
@@ -689,6 +757,10 @@
   *skip = 0;
   *psse = INT64_MAX;
+  od_encode_checkpoint(&x->daala_enc, &buf);
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
     if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
@@ -710,6 +782,9 @@
       mbmi->tx_type = tx_type;
       txfm_rd_in_plane(cm, x, &r, &d, &s, &sse, ref_best_rd, 0, bs, n,
+      od_encode_rollback(&x->daala_enc, &buf);
       if (n < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
           r != INT_MAX) {
         if (is_inter)
@@ -761,6 +836,11 @@
   mbmi->tx_type = best_tx_type;
   if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
+  if (best_tx < TX_SIZES)
+    txfm_rd_in_plane(cm, x, &r, &d, &s, &sse, ref_best_rd, 0, bs, best_tx,
+                     cpi->sf.use_fast_coef_costing);
 static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate,
@@ -954,7 +1034,9 @@
                                      ENTROPY_CONTEXT *l, int *bestrate,
                                      int *bestratey, int64_t *bestdistortion,
                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
@@ -974,6 +1056,12 @@
   uint16_t best_dst16[8 * 8];
+  od_rollback_buffer pre_buf, post_buf;
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
+  od_encode_checkpoint(&x->daala_enc, &post_buf);
   memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
   memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
@@ -1082,6 +1170,10 @@
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
@@ -1104,15 +1196,45 @@
         const int block = (row + idy) * 2 + (col + idx);
         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         int16_t *const src_diff =
             av1_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        int lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+        const int diff_stride = 8;
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+        int16_t *pred = &pd->pred[4 * (row * diff_stride + col)];
+        int16_t *src_int16 = &p->src_int16[4 * (row * diff_stride + col)];
+        int i, j, tx_blk_size;
+        TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+        int rate_pvq;
+        int skip;
         xd->mi[0]->bmi[block].as_mode = mode;
         av1_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride, dst,
                                 dst_stride, col + idx, row + idy, 0);
         aom_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+        if (lossless) tx_type = DCT_DCT;
+        // transform block size in pixels
+        tx_blk_size = 4;
+        // Copy the uint8 source and predicted blocks into int16 buffers so
+        // that the existing forward transform functions can be used on them.
+        for (j = 0; j < tx_blk_size; j++)
+          for (i = 0; i < tx_blk_size; i++) {
+            src_int16[diff_stride * j + i] = src[src_stride * j + i];
+            pred[diff_stride * j + i] = dst[dst_stride * j + i];
+          }
+        av1_fwd_txfm_4x4(src_int16, coeff, diff_stride, tx_type, lossless);
+        av1_fwd_txfm_4x4(pred, ref_coeff, diff_stride, tx_type, lossless);
         if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
           TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
           const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
           av1_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
@@ -1121,12 +1243,27 @@
           ratey += cost_coeffs(cm, x, 0, block, tempa + idx, templ + idy,
                                TX_4X4, scan_order->scan, scan_order->neighbors,
+          skip = av1_pvq_encode_helper(&x->daala_enc, coeff, ref_coeff, dqcoeff,
+                                       &p->eobs[block], pd->dequant, 0, TX_4X4,
+                                       tx_type, &rate_pvq, x->pvq_speed, NULL);
+          ratey += rate_pvq;
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
-          av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
-                               dst_stride, p->eobs[block], DCT_DCT, 1);
+          if (!skip) {
+            for (j = 0; j < tx_blk_size; j++)
+              for (i = 0; i < tx_blk_size; i++) dst[j * dst_stride + i] = 0;
+            av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+                                 dst_stride, p->eobs[block], DCT_DCT, 1);
+          }
         } else {
           int64_t unused;
           TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
           const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
           av1_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
@@ -1135,16 +1272,31 @@
           ratey += cost_coeffs(cm, x, 0, block, tempa + idx, templ + idy,
                                TX_4X4, scan_order->scan, scan_order->neighbors,
+          skip = av1_pvq_encode_helper(&x->daala_enc, coeff, ref_coeff, dqcoeff,
+                                       &p->eobs[block], pd->dequant, 0, TX_4X4,
+                                       tx_type, &rate_pvq, x->pvq_speed, NULL);
+          ratey += rate_pvq;
+          // No need for av1_block_error2_c because the ssz is unused
           distortion += av1_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >>
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
-          av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
-                               dst_stride, p->eobs[block], tx_type, 0);
+          if (!skip) {
+            for (j = 0; j < tx_blk_size; j++)
+              for (i = 0; i < tx_blk_size; i++) dst[j * dst_stride + i] = 0;
+            av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+                                 dst_stride, p->eobs[block], tx_type, 0);
+          }
-    }
+    }  // idy loop
     rate += ratey;
     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
@@ -1157,15 +1309,25 @@
       *best_mode = mode;
       memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
       memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
+      od_encode_checkpoint(&x->daala_enc, &post_buf);
       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                num_4x4_blocks_wide * 4);
   next : {}
-  }
+    od_encode_rollback(&x->daala_enc, &pre_buf);
+  }  // mode decision loop
   if (best_rd >= rd_thresh) return best_rd;
+  od_encode_rollback(&x->daala_enc, &post_buf);
   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
     memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
            num_4x4_blocks_wide * 4);
@@ -1537,6 +1699,12 @@
   const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = av1_above_block_mode(xd->mi[0], above_mi, 0);
   const PREDICTION_MODE L = av1_left_block_mode(xd->mi[0], left_mi, 0);
+  od_rollback_buffer pre_buf, post_buf;
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
+  od_encode_checkpoint(&x->daala_enc, &post_buf);
   bmode_costs = cpi->y_mode_costs[A][L];
@@ -1564,6 +1732,9 @@
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     mbmi->mode = mode;
+    od_encode_rollback(&x->daala_enc, &pre_buf);
     if (is_directional_mode(mbmi->mode)) {
       if (directional_mode_skip_mask[mbmi->mode]) continue;
@@ -1613,9 +1784,16 @@
       *rate_tokenonly = this_rate_tokenonly;
       *distortion = this_distortion;
       *skippable = s;
+      od_encode_checkpoint(&x->daala_enc, &post_buf);
+  od_encode_rollback(&x->daala_enc, &post_buf);
   if (cpi->common.allow_screen_content_tools)
     rd_pick_palette_intra_sby(cpi, x, bsize, palette_ctx, bmode_costs[DC_PRED],
@@ -1659,12 +1837,12 @@
   int is_cost_valid = 1;
   if (ref_best_rd < 0) is_cost_valid = 0;
   if (is_inter_block(mbmi) && is_cost_valid) {
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       av1_subtract_plane(x, bsize, plane);
   *rate = 0;
   *distortion = 0;
   *sse = 0;
@@ -1948,6 +2126,11 @@
   int64_t best_rd = INT64_MAX, this_rd;
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
+  od_rollback_buffer buf;
+  od_encode_checkpoint(&x->daala_enc, &buf);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int rows =
@@ -1982,15 +2165,23 @@
     } else {
       mbmi->intra_angle_delta[1] = 0;
       if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
-                            &this_sse, bsize, best_rd))
+                            &this_sse, bsize, best_rd)) {
+        od_encode_rollback(&x->daala_enc, &buf);
+      }
       rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode];
     this_rate = this_rate_tokenonly + rate_overhead;
     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
-                          &this_sse, bsize, best_rd))
+                          &this_sse, bsize, best_rd)) {
+      od_encode_rollback(&x->daala_enc, &buf);
+    }
     this_rate = this_rate_tokenonly + cpi->intra_uv_mode_cost[mbmi->mode][mode];
 #endif  // CONFIG_EXT_INTRA
@@ -2000,7 +2191,22 @@
       this_rate += av1_cost_bit(
           av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
 #endif  // CONFIG_PALETTE
+    // For chroma channels, multiply lambda by 0.5 when doing intra prediction
+    // NOTE: Chroma intra prediction itself has a separate RDO,
+    // though the final chroma intra mode's D and R are simply added to
+    // those of luma, and then a global RDO is performed to decide the modes
+    // of the SB. Also, for chroma, the RDO cannot decide tx_size (it follows
+    // luma's decision) or tx_type (DCT only), so the intra prediction mode is
+    // the only decision chroma makes based on its own separate RDO.
+    // TODO(yushin) : Seek a more reasonable solution than this.
+    this_rd = RDCOST(x->rdmult >> (1 * PVQ_CHROMA_RD), x->rddiv, this_rate,
+                     this_distortion);
+    od_encode_rollback(&x->daala_enc, &buf);
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
     if (this_rd < best_rd) {
       mode_selected = mode;
@@ -2161,11 +2367,14 @@
 static int64_t encode_inter_mb_segment(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int64_t best_yrd, int i, int *labelyrate,
-                                       int64_t *distortion, int64_t *sse,
-                                       ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
-                                       int ir, int ic, int mi_row, int mi_col) {
+                                       int64_t best_yrd, int block,
+                                       int *labelyrate, int64_t *distortion,
+                                       int64_t *sse, ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl, int ir, int ic,
+                                       int mi_row, int mi_col) {
   const AV1_COMMON *const cm = &cpi->common;
   int k;
   MACROBLOCKD *xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
@@ -2176,17 +2385,22 @@
   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   int idx, idy;
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   const uint8_t *const src =
-      &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+      &p->src.buf[av1_raster_block_offset(BLOCK_8X8, block, p->src.stride)];
   uint8_t *const dst =
-      &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+      &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, block, pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0;
-  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i);
+  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
   const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
+  (void)cpi;
+  (void)ta;
+  (void)tl;
-  av1_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
+  av1_build_inter_predictor_sub8x8(xd, 0, block, ir, ic, mi_row, mi_col);
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -2199,33 +2413,72 @@
   fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? av1_fwht4x4 : aom_fdct4x4;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        height, width, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
-        8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+        height, width,
+        av1_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff), 8, src,
+        p->src.stride, dst, pd->dst.stride, xd->bd);
   } else {
-    aom_subtract_block(height, width,
-                       av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+    aom_subtract_block(height, width, av1_raster_block_offset_int16(
+                                          BLOCK_8X8, block, p->src_diff),
                        8, src, p->src.stride, dst, pd->dst.stride);
-  aom_subtract_block(height, width,
-                     av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+  aom_subtract_block(height, width, av1_raster_block_offset_int16(
+                                        BLOCK_8X8, block, p->src_diff),
                      8, src, p->src.stride, dst, pd->dst.stride);
+#endif  // !CONFIG_PVQ
-  k = i;
+  k = block;
   for (idy = 0; idy < height / 4; ++idy) {
     for (idx = 0; idx < width / 4; ++idx) {
       int64_t ssz, rd, rd1, rd2;
       tran_low_t *coeff;
+      const int src_stride = p->src.stride;
+      const int dst_stride = pd->dst.stride;
+      const int diff_stride = 8;
+      tran_low_t *dqcoeff;
+      tran_low_t *ref_coeff;
+      int16_t *pred = &pd->pred[4 * (ir * diff_stride + ic)];
+      int16_t *src_int16 = &p->src_int16[4 * (ir * diff_stride + ic)];
+      int i, j, tx_blk_size;
+      int rate_pvq;
       k += (idy * 2 + idx);
       coeff = BLOCK_OFFSET(p->coeff, k);
       fwd_txm4x4(av1_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                  coeff, 8);
       av1_regular_quantize_b_4x4(x, 0, k, scan_order->scan, scan_order->iscan);
+      dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
+      ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, k);
+      // transform block size in pixels
+      tx_blk_size = 4;
+      // Copy the uint8 source and predicted blocks into int16 buffers so
+      // that the existing forward transform functions can be used on them.
+      for (j = 0; j < tx_blk_size; j++)
+        for (i = 0; i < tx_blk_size; i++) {
+          src_int16[diff_stride * j + i] =
+              src[src_stride * (j + 4 * idy) + (i + 4 * idx)];
+          pred[diff_stride * j + i] =
+              dst[dst_stride * (j + 4 * idy) + (i + 4 * idx)];
+        }
+      fwd_txm4x4(src_int16, coeff, diff_stride);
+      fwd_txm4x4(pred, ref_coeff, diff_stride);
+      av1_pvq_encode_helper(&x->daala_enc, coeff, ref_coeff, dqcoeff,
+                            &p->eobs[k], pd->dequant, 0, TX_4X4, tx_type,
+                            &rate_pvq, x->pvq_speed, NULL);
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         thisdistortion += av1_highbd_block_error(
@@ -2234,14 +2487,21 @@
         thisdistortion +=
             av1_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
+      thisdistortion += av1_block_error2_c(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                           ref_coeff, 16, &ssz);
       thisdistortion +=
           av1_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
       thissse += ssz;
       thisrate += cost_coeffs(cm, x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
                               scan_order->scan, scan_order->neighbors,
+      thisrate += rate_pvq;
       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
       rd = AOMMIN(rd1, rd2);
@@ -2588,6 +2848,11 @@
   const int has_second_rf = has_second_ref(mbmi);
   const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  od_rollback_buffer pre_buf;
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
@@ -2619,6 +2884,11 @@
       int64_t new_best_rd = INT64_MAX;
       const int index = idy * 2 + idx;
       int ref;
+      od_rollback_buffer idx_buf, post_buf;
+      od_encode_checkpoint(&x->daala_enc, &idx_buf);
+      od_encode_checkpoint(&x->daala_enc, &post_buf);
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
@@ -2646,6 +2916,9 @@
         memcpy(bsi->rdstat[index][mode_idx].tl, t_left,
+        od_encode_rollback(&x->daala_enc, &idx_buf);
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
@@ -2847,6 +3120,9 @@
             if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
               mode_selected = this_mode;
               new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
+              od_encode_checkpoint(&x->daala_enc, &post_buf);
@@ -2873,6 +3149,10 @@
         if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
           mode_selected = this_mode;
           new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
+          od_encode_checkpoint(&x->daala_enc, &post_buf);
       } /*for each 4x4 mode*/
@@ -2882,12 +3162,18 @@
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
+        od_encode_rollback(&x->daala_enc, &pre_buf);
         return INT64_MAX;
       mode_idx = INTER_OFFSET(mode_selected);
       memcpy(t_above, bsi->rdstat[index][mode_idx].ta, sizeof(t_above));
       memcpy(t_left, bsi->rdstat[index][mode_idx].tl, sizeof(t_left));
+      od_encode_rollback(&x->daala_enc, &post_buf);
       set_and_cost_bmi_mvs(cpi, x, xd, index, mode_selected,
                            mode_mv[mode_selected], frame_mv, seg_mvs[index],
@@ -2905,10 +3191,16 @@
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
+        od_encode_rollback(&x->daala_enc, &pre_buf);
         return INT64_MAX;
   } /* for each label */
+  od_encode_rollback(&x->daala_enc, &pre_buf);
   bsi->r = br;
   bsi->d = bd;
@@ -3721,8 +4013,10 @@
       int64_t sseuv = INT64_MAX;
       int64_t rdcosty = INT64_MAX;
-      // Y cost and distortion
+// Y cost and distortion
       av1_subtract_plane(x, bsize, 0);
       super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize,
@@ -4087,6 +4381,10 @@
   int64_t mode_threshold[MAX_MODES];
   int *mode_map = tile_data->mode_map[bsize];
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
+  od_rollback_buffer pre_buf;
   const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
   const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
@@ -4318,6 +4616,9 @@
     midx = end_pos;
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int mode_excluded = 0;
@@ -4332,6 +4633,9 @@
     uint8_t ref_frame_type;
+    od_encode_rollback(&x->daala_enc, &pre_buf);
     this_mode = av1_mode_order[mode_index].mode;
     ref_frame = av1_mode_order[mode_index].ref_frame[0];
     second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
@@ -5231,6 +5535,11 @@
   int ref_frame_skip_mask[2] = { 0 };
   int internal_active_edge =
       av1_active_edge_sb(cpi, mi_row, mi_col) && av1_internal_image_edge(cpi);
+  od_rollback_buffer pre_buf;
+  od_encode_checkpoint(&x->daala_enc, &pre_buf);
@@ -5278,6 +5587,10 @@
     int this_skip2 = 0;
     int64_t total_sse = INT_MAX;
+    od_encode_rollback(&x->daala_enc, &pre_buf);
     ref_frame = av1_ref_order[ref_index].ref_frame[0];
     second_ref_frame = av1_ref_order[ref_index].ref_frame[1];
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index d5afaae..142bde8 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -304,6 +304,7 @@
 static void set_entropy_context_b(int plane, int block, int blk_row,
                                   int blk_col, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
@@ -473,6 +474,7 @@
   av1_set_contexts(xd, pd, tx_size, c > 0, blk_col, blk_row);
 struct is_skippable_args {
   uint16_t *eobs;
@@ -520,6 +522,42 @@
   return result;
+void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x, PVQ_INFO *pvq) {
+  PVQ_QUEUE *q = x->pvq_q;
+  if (q->curr_pos >= q->buf_len) {
+    q->buf_len = 2 * q->buf_len + 1;
+    CHECK_MEM_ERROR(cm, q->buf, aom_realloc(q->buf, q->buf_len * sizeof(PVQ_INFO)));
+  }
+  //memcpy(q->buf + q->curr_pos, pvq, sizeof(PVQ_INFO));
+  OD_COPY(q->buf + q->curr_pos, pvq, 1);
+  ++q->curr_pos;
+// NOTE: This does not actually generate tokens; instead we store the encoding
+// decisions made for PVQ in a queue that we will read from when
+// actually writing the bitstream in write_modes_b.
+static void tokenize_pvq(int plane, int block, int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args *const args = arg;
+  const AV1_COMP *cpi = args->cpi;
+  const AV1_COMMON *const cm = &cpi->common;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  PVQ_INFO *pvq_info;
+  (void)block;
+  (void)blk_row;
+  (void)blk_col;
+  (void)plane_bsize;
+  (void)tx_size;
+  assert(block < MAX_PVQ_BLOCKS_IN_SB);
+  pvq_info = &x->pvq[block][plane];
+  add_pvq_block((AV1_COMMON *const)cm, x, pvq_info);
 void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                      int dry_run, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -536,11 +574,11 @@
   if (!dry_run) {
     int plane;
     td->counts->skip[ctx][0] += skip_inc;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
       av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
@@ -550,4 +588,15 @@
   } else {
     av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+  if (!dry_run) {
+    int plane;
+    td->counts->skip[ctx][0] += skip_inc;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_pvq,
+                                             &arg);
+  }
diff --git a/configure b/configure
index 64d48b0..7f867e5 100755
--- a/configure
+++ b/configure
@@ -259,6 +259,7 @@
+    pvq
diff --git a/test/ b/test/
index ea6da47..d3a134a 100644
--- a/test/
+++ b/test/
@@ -1,26 +1,13 @@
-/*Daala video codec
-Copyright (c) 2013 Daala project contributors.  All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
 #include <stdlib.h>