DPCM intra coding experiment

Encode a block line by line, either horizontally or vertically. In vertical
mode, each row is predicted from the reconstructed row above; in horizontal
mode, each column is predicted from the reconstructed column to its left.

The DPCM modes are enabled automatically for blocks that use a horizontal
or vertical intra prediction mode together with a 1D transform type (ext-tx).
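
For intuition, a minimal sketch of the vertical mode (illustration only, not
code from this change: it skips the 1D transform and quantization, and uses a
flat placeholder predictor for the first row):

  #include <stdint.h>

  static void dpcm_vert_sketch(const uint8_t *src, int src_stride,
                               uint8_t *recon, int recon_stride,
                               int width, int height) {
    for (int r = 0; r < height; ++r) {
      for (int c = 0; c < width; ++c) {
        // Rows after the first are predicted from the reconstructed row
        // above; the first row would come from the regular intra predictor
        // (128 is just a placeholder here).
        const int pred = (r > 0) ? recon[(r - 1) * recon_stride + c] : 128;
        // The real code 1D-transforms, quantizes and dequantizes the whole
        // residual row before adding it back, so the reconstruction is
        // lossy there.
        const int residual = src[r * src_stride + c] - pred;
        recon[r * recon_stride + c] = (uint8_t)(pred + residual);
      }
    }
  }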

Change-Id: I133ab6b537fa24a6e314ee1ef1d2fe9bd9d56c13
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 71d761d..3aa9822 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1419,6 +1419,301 @@
 #endif
 }
 
+#if CONFIG_DPCM_INTRA
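+// Returns the number of coefficients to code: the index of the last nonzero
+// quantized coefficient in scan order, plus one (0 if all are zero).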
+static int get_eob(const tran_low_t *qcoeff, intptr_t n_coeffs,
+                   const int16_t *scan) {
+  int eob = -1;
+  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    if (qcoeff[rc]) {
+      eob = i;
+      break;
+    }
+  }
+  return eob + 1;
+}
+
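+// Quantizes a single transform coefficient and writes both the quantized and
+// dequantized values. Coefficients below the (scaled) zero-bin are skipped
+// and stay zero, since the caller clears the output buffers beforehand.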
+static void quantize_scaler(int coeff, int16_t zbin, int16_t round_value,
+                            int16_t quant, int16_t quant_shift, int16_t dequant,
+                            int log_scale, tran_low_t *const qcoeff,
+                            tran_low_t *const dqcoeff) {
+  zbin = ROUND_POWER_OF_TWO(zbin, log_scale);
+  round_value = ROUND_POWER_OF_TWO(round_value, log_scale);
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  if (abs_coeff >= zbin) {
+    int tmp = clamp(abs_coeff + round_value, INT16_MIN, INT16_MAX);
+    tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - log_scale);
+    *qcoeff = (tmp ^ coeff_sign) - coeff_sign;
+    *dqcoeff = (*qcoeff * dequant) / (1 << log_scale);
+  }
+}
+
+typedef void (*dpcm_fwd_tx_func)(const int16_t *input, int stride,
+                                 TX_TYPE_1D tx_type, tran_low_t *output);
+
+static dpcm_fwd_tx_func get_dpcm_fwd_tx_func(int tx_length) {
+  switch (tx_length) {
+    case 4: return av1_dpcm_ft4_c;
+    case 8: return av1_dpcm_ft8_c;
+    case 16: return av1_dpcm_ft16_c;
+    case 32:
+      return av1_dpcm_ft32_c;
+    // TODO(huisu): add support for TX_64X64.
+    default: assert(0); return NULL;
+  }
+}
+
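+// Vertical DPCM: process the block one row at a time. Each row is predicted
+// from the reconstructed row above, the residual row is 1D-transformed,
+// quantized and dequantized, and the inverse transform adds it back into dst
+// so the next row can be predicted from the reconstruction.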
+static void process_block_dpcm_vert(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d,
+                                    struct macroblockd_plane *const pd,
+                                    struct macroblock_plane *const p,
+                                    uint8_t *src, int src_stride, uint8_t *dst,
+                                    int dst_stride, int16_t *src_diff,
+                                    int diff_stride, tran_low_t *coeff,
+                                    tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+  const int tx1d_width = tx_size_wide[tx_size];
+  dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_width);
+  dpcm_inv_txfm_add_func inverse_tx =
+      av1_get_dpcm_inv_txfm_add_func(tx1d_width);
+  const int tx1d_height = tx_size_high[tx_size];
+  const int log_scale = av1_get_tx_scale(tx_size);
+  int q_idx = 0;
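+  // Index 0 selects the DC quantizer entries; after the first coefficient of
+  // the block, index 1 (AC) is used for everything else.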
+  for (int r = 0; r < tx1d_height; ++r) {
+    // Update prediction.
+    if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0]));
+    // Subtraction.
+    for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c];
+    // Forward transform.
+    forward_tx(src_diff, 1, tx_type_1d, coeff);
+    // Quantization.
+    for (int c = 0; c < tx1d_width; ++c) {
+      quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx],
+                      p->quant[q_idx], p->quant_shift[q_idx],
+                      pd->dequant[q_idx], log_scale, &qcoeff[c], &dqcoeff[c]);
+      q_idx = 1;
+    }
+    // Inverse transform.
+    inverse_tx(dqcoeff, 1, tx_type_1d, dst);
+    // Move to the next row.
+    coeff += tx1d_width;
+    qcoeff += tx1d_width;
+    dqcoeff += tx1d_width;
+    src_diff += diff_stride;
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
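+// Horizontal DPCM: same scheme as above, but one column at a time, with each
+// column predicted from the reconstructed column to its left.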
+static void process_block_dpcm_horz(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d,
+                                    struct macroblockd_plane *const pd,
+                                    struct macroblock_plane *const p,
+                                    uint8_t *src, int src_stride, uint8_t *dst,
+                                    int dst_stride, int16_t *src_diff,
+                                    int diff_stride, tran_low_t *coeff,
+                                    tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+  const int tx1d_height = tx_size_high[tx_size];
+  dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_height);
+  dpcm_inv_txfm_add_func inverse_tx =
+      av1_get_dpcm_inv_txfm_add_func(tx1d_height);
+  const int tx1d_width = tx_size_wide[tx_size];
+  const int log_scale = av1_get_tx_scale(tx_size);
+  int q_idx = 0;
+  for (int c = 0; c < tx1d_width; ++c) {
+    for (int r = 0; r < tx1d_height; ++r) {
+      // Update prediction.
+      if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1];
+      // Subtraction.
+      src_diff[r * diff_stride] = src[r * src_stride] - dst[r * dst_stride];
+    }
+    // Forward transform.
+    tran_low_t tx_buff[64];
+    forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff);
+    for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r];
+    // Quantization.
+    for (int r = 0; r < tx1d_height; ++r) {
+      quantize_scaler(coeff[r * tx1d_width], p->zbin[q_idx], p->round[q_idx],
+                      p->quant[q_idx], p->quant_shift[q_idx],
+                      pd->dequant[q_idx], log_scale, &qcoeff[r * tx1d_width],
+                      &dqcoeff[r * tx1d_width]);
+      q_idx = 1;
+    }
+    // Inverse transform.
+    for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width];
+    inverse_tx(tx_buff, dst_stride, tx_type_1d, dst);
+    // Move to the next column.
+    ++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src;
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
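+// High-bitdepth counterparts of the two functions above, operating on
+// uint16_t pixels and passing the bit depth to the inverse transform.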
+static void hbd_process_block_dpcm_vert(
+    TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, int bd,
+    struct macroblockd_plane *const pd, struct macroblock_plane *const p,
+    uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride,
+    int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff,
+    tran_low_t *dqcoeff) {
+  const int tx1d_width = tx_size_wide[tx_size];
+  dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_width);
+  hbd_dpcm_inv_txfm_add_func inverse_tx =
+      av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_width);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const int tx1d_height = tx_size_high[tx_size];
+  const int log_scale = av1_get_tx_scale(tx_size);
+  int q_idx = 0;
+  for (int r = 0; r < tx1d_height; ++r) {
+    // Update prediction.
+    if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0]));
+    // Subtraction.
+    for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c];
+    // Forward transform.
+    forward_tx(src_diff, 1, tx_type_1d, coeff);
+    // Quantization.
+    for (int c = 0; c < tx1d_width; ++c) {
+      quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx],
+                      p->quant[q_idx], p->quant_shift[q_idx],
+                      pd->dequant[q_idx], log_scale, &qcoeff[c], &dqcoeff[c]);
+      q_idx = 1;
+    }
+    // Inverse transform.
+    inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst);
+    // Move to the next row.
+    coeff += tx1d_width;
+    qcoeff += tx1d_width;
+    dqcoeff += tx1d_width;
+    src_diff += diff_stride;
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+static void hbd_process_block_dpcm_horz(
+    TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, int bd,
+    struct macroblockd_plane *const pd, struct macroblock_plane *const p,
+    uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride,
+    int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff,
+    tran_low_t *dqcoeff) {
+  const int tx1d_height = tx_size_high[tx_size];
+  dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_height);
+  hbd_dpcm_inv_txfm_add_func inverse_tx =
+      av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_height);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const int tx1d_width = tx_size_wide[tx_size];
+  const int log_scale = av1_get_tx_scale(tx_size);
+  int q_idx = 0;
+  for (int c = 0; c < tx1d_width; ++c) {
+    for (int r = 0; r < tx1d_height; ++r) {
+      // Update prediction.
+      if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1];
+      // Subtraction.
+      src_diff[r * diff_stride] = src[r * src_stride] - dst[r * dst_stride];
+    }
+    // Forward transform.
+    tran_low_t tx_buff[64];
+    forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff);
+    for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r];
+    // Quantization.
+    for (int r = 0; r < tx1d_height; ++r) {
+      quantize_scaler(coeff[r * tx1d_width], p->zbin[q_idx], p->round[q_idx],
+                      p->quant[q_idx], p->quant_shift[q_idx],
+                      pd->dequant[q_idx], log_scale, &qcoeff[r * tx1d_width],
+                      &dqcoeff[r * tx1d_width]);
+      q_idx = 1;
+    }
+    // Inverse transform.
+    for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width];
+    inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst);
+    // Move to the next column.
+    ++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src;
+  }
+}
+#endif  // CONFIG_HIGHBITDEPTH
+
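+// Encodes an intra block with DPCM: prediction, forward transform,
+// quantization and reconstruction are interleaved row by row (V_PRED) or
+// column by column (H_PRED) instead of being applied to the whole block at
+// once.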
+void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x,
+                                 PREDICTION_MODE mode, int plane, int block,
+                                 int blk_row, int blk_col,
+                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                 TX_TYPE tx_type, ENTROPY_CONTEXT *ta,
+                                 ENTROPY_CONTEXT *tl, int8_t *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  const int tx1d_width = tx_size_wide[tx_size];
+  const int tx1d_height = tx_size_high[tx_size];
+  const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, 0);
+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  uint8_t *dst =
+      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  uint8_t *src =
+      &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+  int16_t *src_diff =
+      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+  uint16_t *eob = &p->eobs[block];
+  *eob = 0;
+  memset(qcoeff, 0, tx1d_height * tx1d_width * sizeof(*qcoeff));
+  memset(dqcoeff, 0, tx1d_height * tx1d_width * sizeof(*dqcoeff));
+
+  if (LIKELY(!x->skip_block)) {
+    TX_TYPE_1D tx_type_1d = DCT_1D;
+    switch (tx_type) {
+      case IDTX: tx_type_1d = IDTX_1D; break;
+      case V_DCT:
+        assert(mode == H_PRED);
+        tx_type_1d = DCT_1D;
+        break;
+      case H_DCT:
+        assert(mode == V_PRED);
+        tx_type_1d = DCT_1D;
+        break;
+      default: assert(0);
+    }
+    switch (mode) {
+      case V_PRED:
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          hbd_process_block_dpcm_vert(tx_size, tx_type_1d, xd->bd, pd, p, src,
+                                      src_stride, dst, dst_stride, src_diff,
+                                      diff_stride, coeff, qcoeff, dqcoeff);
+        } else {
+#endif  // CONFIG_HIGHBITDEPTH
+          process_block_dpcm_vert(tx_size, tx_type_1d, pd, p, src, src_stride,
+                                  dst, dst_stride, src_diff, diff_stride, coeff,
+                                  qcoeff, dqcoeff);
+#if CONFIG_HIGHBITDEPTH
+        }
+#endif  // CONFIG_HIGHBITDEPTH
+        break;
+      case H_PRED:
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          hbd_process_block_dpcm_horz(tx_size, tx_type_1d, xd->bd, pd, p, src,
+                                      src_stride, dst, dst_stride, src_diff,
+                                      diff_stride, coeff, qcoeff, dqcoeff);
+        } else {
+#endif  // CONFIG_HIGHBITDEPTH
+          process_block_dpcm_horz(tx_size, tx_type_1d, pd, p, src, src_stride,
+                                  dst, dst_stride, src_diff, diff_stride, coeff,
+                                  qcoeff, dqcoeff);
+#if CONFIG_HIGHBITDEPTH
+        }
+#endif  // CONFIG_HIGHBITDEPTH
+        break;
+      default: assert(0);
+    }
+    *eob = get_eob(qcoeff, tx1d_height * tx1d_width, scan_order->scan);
+  }
+
+  ta[blk_col] = tl[blk_row] = *eob > 0;
+  if (*eob) *skip = 0;
+}
+#endif  // CONFIG_DPCM_INTRA
+
 void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                             void *arg) {
@@ -1448,6 +1743,20 @@
 #else
   av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
 #endif
+
+#if CONFIG_DPCM_INTRA
+  const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block);
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const PREDICTION_MODE mode =
+      (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode;
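+  // The DPCM path does its own subtraction, transform, quantization and
+  // reconstruction, so it returns without running the regular pipeline below.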
+  if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) {
+    av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col,
+                                plane_bsize, tx_size, tx_type, args->ta,
+                                args->tl, args->skip);
+    return;
+  }
+#endif  // CONFIG_DPCM_INTRA
+
   av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
 
   const ENTROPY_CONTEXT *a = &args->ta[blk_col];