New experiment: Perceptual Vector Quantization from Daala

PVQ replaces the scalar quantizer and coefficient coding with a new
design originally developed in Daala. It currently depends on the
Daala entropy coder, although it could be adapted to work with another
entropy coder if needed. To enable it, configure with:
./configure --enable-experimental --enable-daala_ec --enable-pvq

The version of PVQ in this commit is adapted from the following
revision of Daala:
https://github.com/xiph/daala/commit/fb51c1ade6a31b668a0157d89de8f0a4493162a8

More information about PVQ:
- https://people.xiph.org/~jm/daala/pvq_demo/
- https://jmvalin.ca/papers/spie_pvq.pdf

The following files are copied from Daala with only minimal
adaptations. clang-format is therefore disabled on those files
to make it easier to synchronize the AV1 and Daala codebases in the future:
 av1/common/generic_code.c
 av1/common/generic_code.h
 av1/common/laplace_tables.c
 av1/common/partition.c
 av1/common/partition.h
 av1/common/pvq.c
 av1/common/pvq.h
 av1/common/state.c
 av1/common/state.h
 av1/common/zigzag.h
 av1/common/zigzag16.c
 av1/common/zigzag32.c
 av1/common/zigzag4.c
 av1/common/zigzag64.c
 av1/common/zigzag8.c
 av1/decoder/decint.h
 av1/decoder/generic_decoder.c
 av1/decoder/laplace_decoder.c
 av1/decoder/pvq_decoder.c
 av1/decoder/pvq_decoder.h
 av1/encoder/daala_compat_enc.c
 av1/encoder/encint.h
 av1/encoder/generic_encoder.c
 av1/encoder/laplace_encoder.c
 av1/encoder/pvq_encoder.c
 av1/encoder/pvq_encoder.h

Known issues:
- Lossless mode is not supported; '--lossless=1' will give the same result as
  '--end-usage=q --cq-level=1'.
- High bit depth is not supported by PVQ.

Change-Id: I1ae0d6517b87f4c1ccea944b2e12dc906979f25e
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 5c8a2f6..aa04389 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -14,6 +14,9 @@
 
 #include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
+#if CONFIG_PVQ
+#include "av1/encoder/encint.h"
+#endif
 #if CONFIG_REF_MV
 #include "av1/common/mvref_common.h"
 #endif
@@ -22,6 +25,12 @@
 extern "C" {
 #endif
 
+#if CONFIG_PVQ
+// Maximum possible number of tx blocks in the luma plane, which is currently
+// 256, since there can be 16x16 tx blocks of size 4x4.
+#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0)
+#endif
+
 typedef struct {
   unsigned int sse;
   int sum;
@@ -30,6 +39,9 @@
 
 typedef struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
+#if CONFIG_PVQ
+  DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]);
+#endif
   tran_low_t *qcoeff;
   tran_low_t *coeff;
   uint16_t *eobs;
@@ -176,6 +188,25 @@
   int use_default_intra_tx_type;
   // use default transform and skip transform type search for inter modes
   int use_default_inter_tx_type;
+#if CONFIG_PVQ
+  int rate;
+  // 1 if neither AC nor DC is coded. Only used during RDO.
+  int pvq_skip[MAX_MB_PLANE];
+  PVQ_QUEUE *pvq_q;
+
+  // Storage for PVQ tx block encodings in a superblock.
+  // There can be at most 16x16 4x4 blocks (per Y, U, V plane) encoded by PVQ.
+  // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from:
+  // 1) PVQ is applied to each transformed block
+  // 2) 4x4 is the smallest tx size in AV1
+  // 3) AV1 allows using a smaller tx size than the block (i.e. partition) size
+  // TODO(yushin) : The memory usage could be improved a lot, since this has
+  // storage for 10 bands and 128 coefficients for every 4x4 block.
+  PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE];
+  daala_enc_ctx daala_enc;
+  int pvq_speed;
+  int pvq_coded;  // Indicates whether pvq_info needs to be stored to tokenize
+#endif
 };
 
 #ifdef __cplusplus