Enable idtx in rtc for screen content mode

The identity transform (IDTX) is only evaluated in screen content mode, and
only when the best mode found so far is an inter mode. Other settings remain
identical. The coding performance gains for speed 10 are
            avg PSNR  overall PSNR   SSIM    Speed change
rtc_screen   -18.7%    -14.9%       -20.9%     -13.0%
For speed 8, the changes are
            avg PSNR  overall PSNR   SSIM    Speed change
rtc_screen   -15.7%    -13.2%       -17.5%     -15.7%
The PSNR gains for the same bit-rate are in the range of 0.5-1.5dB.

STATS_CHANGED

Change-Id: I3341f6efc3bae54e313b0f2451592c11b3f19e27
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 59fba50..fce08f1 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1119,6 +1119,8 @@
   add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
   specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/;
 
+  add_proto qw/void aom_pixel_scale/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff, int log_scale, int h8, int w8";
+  specialize qw/aom_pixel_scale sse2/;
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index b88d48c..508db4d 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -88,6 +88,14 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+void aom_pixel_scale_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                       int16_t *coeff, int log_scale, int h8, int w8) {
+  const int w = w8 * 8, scale = 1 << log_scale;  // w: packed coeff row width
+  for (int idy = 0; idy < h8 * 8; ++idy)
+    for (int idx = 0; idx < w; ++idx)  // multiply: << on negatives is UB
+      coeff[idy * w + idx] = src_diff[idy * src_stride + idx] * scale;
+}
+
 static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride,
                           int16_t *coeff) {
   int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1;
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index c4e4dbc..c390967 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -314,6 +314,56 @@
   hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
 }
 
+// Scales a block of 16-bit samples by 2^log_scale (SSE2).
+//
+// The block is (8 * h8) rows by (8 * w8) columns and is processed one 8x8
+// tile at a time, each tile row being a single 128-bit vector.
+//
+//   src_diff   First input sample; consecutive input rows are src_stride
+//              elements apart.
+//   coeff      Output buffer, written row-major with a packed row width of
+//              8 * w8 elements.
+//   log_scale  Left-shift count applied to every 16-bit lane; bits shifted
+//              out of a lane are discarded (no saturation).
+//   h8, w8     Block height and width, in units of 8 samples.
+//
+// Aligned loads and stores are used, so each addressed row of src_diff and
+// coeff must begin on a 16-byte boundary.
+void aom_pixel_scale_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                          int16_t *coeff, int log_scale, int h8, int w8) {
+  // Output rows are packed: exactly 8 * w8 elements each.
+  const int out_stride = w8 * 8;
+
+  // Walk the block in raster order of 8x8 tiles.
+  for (int tile_row = 0; tile_row < h8; ++tile_row) {
+    // Top row of the current band of 8x8 tiles, in the input (band_in) and
+    // in the output (band_out).
+    const int16_t *const band_in = src_diff + tile_row * 8 * src_stride;
+    int16_t *const band_out = coeff + tile_row * 8 * out_stride;
+
+    for (int tile_col = 0; tile_col < w8; ++tile_col) {
+      const int16_t *const in = band_in + tile_col * 8;
+      int16_t *const out = band_out + tile_col * 8;
+      __m128i rows[8];
+
+      // Read all eight rows of the tile before anything is written back.
+      for (int r = 0; r < 8; ++r) {
+        rows[r] = _mm_load_si128((const __m128i *)(in + r * src_stride));
+      }
+
+      // Multiply by 2^log_scale: shift each 16-bit lane left.
+      for (int r = 0; r < 8; ++r) {
+        rows[r] = _mm_slli_epi16(rows[r], log_scale);
+      }
+
+      // Write the scaled rows into the packed output tile.
+      for (int r = 0; r < 8; ++r) {
+        _mm_store_si128((__m128i *)(out + r * out_stride), rows[r]);
+      }
+    }
+  }
+}
+
 static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
                                         ptrdiff_t src_stride, int16_t *coeff) {
   __m128i src[8];
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index be920f6..5bc2357 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -15,6 +15,8 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/blockd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
@@ -49,6 +51,7 @@
   PRED_BUFFER *best_pred;
   PREDICTION_MODE best_mode;
   TX_SIZE best_tx_size;
+  TX_TYPE tx_type;
   MV_REFERENCE_FRAME best_ref_frame;
   MV_REFERENCE_FRAME best_second_ref_frame;
   uint8_t best_mode_skip_txfm;
@@ -161,6 +164,7 @@
   bp->best_ref_frame = LAST_FRAME;
   bp->best_second_ref_frame = NONE_FRAME;
   bp->best_tx_size = TX_8X8;
+  bp->tx_type = DCT_DCT;
   bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   bp->best_mode_skip_txfm = 0;
   bp->best_mode_initial_skip_flag = 0;
@@ -787,14 +791,15 @@
  * \param[in]    skippable      Pointer to a flag indicating possible tx skip
  * \param[in]    bsize          Current block size
  * \param[in]    tx_size        Transform size
+ * \param[in]    tx_type        Transform kernel type
  *
  * \return Nothing is returned. Instead, calculated RD cost is placed to
  * \c this_rdc. \c skippable flag is set if there is no non-zero quantized
  * coefficients for Hadamard transform
  */
-static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
-                      RD_STATS *this_rdc, int *skippable, BLOCK_SIZE bsize,
-                      TX_SIZE tx_size) {
+void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                   int mi_col, RD_STATS *this_rdc, int *skippable,
+                   BLOCK_SIZE bsize, TX_SIZE tx_size, TX_TYPE tx_type) {
   MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblockd_plane *pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
@@ -871,7 +876,11 @@
                               dqcoeff, p->dequant_QTX, eob, scan_order->scan,
                               scan_order->iscan);
             } else {
-              aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+              if (tx_type == IDTX) {
+                aom_pixel_scale(src_diff, diff_stride, low_coeff, 3, 2, 2);
+              } else {
+                aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+              }
               av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
                               p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
                               p->dequant_QTX, eob, scan_order->scan,
@@ -886,7 +895,11 @@
                               dqcoeff, p->dequant_QTX, eob, scan_order->scan,
                               scan_order->iscan);
             } else {
-              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+              if (tx_type == IDTX) {
+                aom_pixel_scale(src_diff, diff_stride, low_coeff, 3, 1, 1);
+              } else {
+                aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+              }
               av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX,
                               p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
                               p->dequant_QTX, eob, scan_order->scan,
@@ -902,7 +915,14 @@
                               dqcoeff, p->dequant_QTX, eob, scan_order->scan,
                               scan_order->iscan);
             } else {
-              aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+              if (tx_type == IDTX) {
+                for (int idy = 0; idy < 4; ++idy)
+                  for (int idx = 0; idx < 4; ++idx)
+                    low_coeff[idy * 4 + idx] = src_diff[idy * diff_stride + idx]
+                                               << 3;
+              } else {
+                aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+              }
               av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX,
                               p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
                               p->dequant_QTX, eob, scan_order->scan,
@@ -911,21 +931,43 @@
             break;
 #else
           case TX_16X16:
-            aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+            if (tx_type == IDTX) {
+              for (int idy = 0; idy < 16; ++idy)
+                for (int idx = 0; idx < 16; ++idx)
+                  low_coeff[idy * 16 + idx] =
+                      src_diff[idy * diff_stride + idx] * 8;
+            } else {
+              aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+            }
             av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
                             p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
                             p->dequant_QTX, eob, scan_order->scan,
                             scan_order->iscan);
             break;
           case TX_8X8:
-            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+            if (tx_type == IDTX) {
+              for (int idy = 0; idy < 8; ++idy)
+                for (int idx = 0; idx < 8; ++idx)
+                  low_coeff[idy * 8 + idx] =
+                      src_diff[idy * diff_stride + idx] * 8;
+            } else {
+              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+            }
             av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                             scan_order->scan, scan_order->iscan);
             break;
           default:
             assert(tx_size == TX_4X4);
-            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+            if (tx_type == IDTX) {
+              for (int idy = 0; idy < 4; ++idy)
+                for (int idx = 0; idx < 4; ++idx)
+                  low_coeff[idy * 4 + idx] =
+                      src_diff[idy * diff_stride + idx] * 8;
+            } else {
+              aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+            }
+
             av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                             scan_order->scan, scan_order->iscan);
@@ -1267,8 +1309,8 @@
   pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
 
   if (plane == 0) {
-    block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx,
-              AOMMIN(tx_size, TX_16X16));
+    av1_block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx,
+                  AOMMIN(tx_size, TX_16X16), DCT_DCT);
   } else {
     int64_t sse = 0;
     model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &sse, plane, plane);
@@ -2067,8 +2109,8 @@
     if (use_modeled_non_rd_cost)
       model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
     else
-      block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, bsize,
-                mi->tx_size);
+      av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, bsize,
+                    mi->tx_size, DCT_DCT);
     // TODO(kyslov@) Need to account for skippable
     if (x->color_sensitivity[0]) {
       av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
@@ -2797,8 +2839,8 @@
           this_rdc.rate += no_skip_txfm_cost;
         }
       } else {
-        block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, bsize,
-                  mi->tx_size);
+        av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, bsize,
+                      mi->tx_size, DCT_DCT);
         if (this_rdc.skip_txfm ||
             RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >=
                 RDCOST(x->rdmult, 0, this_rdc.sse)) {
@@ -2940,6 +2982,37 @@
                       &orig_dst, tmp, &this_mode_pred, &best_rdc,
                       &best_pickmode);
 
+  if (is_inter_mode(best_pickmode.best_mode) &&
+      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    RD_STATS idtx_rdc;
+    av1_init_rd_stats(&this_rdc);
+    av1_init_rd_stats(&idtx_rdc);
+    int is_skippable;
+
+    this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+    pd->dst.buf = this_mode_pred->data;
+    pd->dst.stride = bw;
+
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+    av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, bsize,
+                  mi->tx_size, DCT_DCT);
+    av1_block_yrd(cpi, x, mi_row, mi_col, &idtx_rdc, &is_skippable, bsize,
+                  mi->tx_size, IDTX);
+    int64_t dct_rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+    int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+    if (idx_rdcost < dct_rdcost) {
+      best_pickmode.tx_type = IDTX;
+      best_rdc.rate -= this_rdc.rate - idtx_rdc.rate;
+      best_rdc.dist -= this_rdc.dist - idtx_rdc.dist;
+      best_rdc.rdcost -= dct_rdcost - idx_rdcost;
+    }
+    pd->dst = orig_dst;
+  }
+
+  xd->tx_type_map[0] = best_pickmode.tx_type;
+  memset(ctx->tx_type_map, best_pickmode.tx_type, ctx->num_4x4_blk);
+  memset(xd->tx_type_map, best_pickmode.tx_type, ctx->num_4x4_blk);
+
   int try_palette =
       cpi->oxcf.tool_cfg.enable_palette &&
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 75747a8..c91bcb6 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom_dsp/txfm_common.h"
+
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
@@ -2057,6 +2059,7 @@
            ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
             (subsampling_x + subsampling_y)));
   }
+
   encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
   if (!dry_run) {
     update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 3b20e6e..eeccb3d 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -166,6 +166,10 @@
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
 
+void av1_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                   int mi_col, RD_STATS *this_rdc, int *skippable,
+                   BLOCK_SIZE bsize, TX_SIZE tx_size, TX_TYPE tx_type);
+
 static INLINE int coded_to_superres_mi(int mi_col, int denom) {
   return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
 }