RTC: Let compiler generate SIMD for scaling in block_yrd_idtx

Performance:
| SPD_SET | TESTSET    | AVG_PSNR | OVR_PSNR | SSIM    | ENC_T |
|---------|------------|----------|----------|---------|-------|
| 5       | rtc_screen | +0.000%  | +0.000%  | +0.000% | -2.7% |
|---------|------------|----------|----------|---------|-------|
| 6       | rtc_screen | +0.000%  | +0.000%  | +0.000% | -2.7% |
|---------|------------|----------|----------|---------|-------|
| 7       | rtc_screen | +0.000%  | +0.000%  | +0.000% | -3.0% |
|---------|------------|----------|----------|---------|-------|
| 8       | rtc_screen | +0.000%  | +0.000%  | +0.000% | -3.0% |
|---------|------------|----------|----------|---------|-------|
| 9       | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.4% |
|---------|------------|----------|----------|---------|-------|
| 10      | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.5% |

Change-Id: Ibcb168d5e530b2dcae089f43de0efdfe4eccd5c7

commit: d7fc49b8b6b5663ec93b6850839e3451e3dd0339 [log] [tgz]
author: chiyotsai <chiyotsai@google.com> Tue Nov 22 13:50:39 2022 -0800
committer: chiyotsai <chiyotsai@google.com> Wed Nov 23 13:46:30 2022 -0800
tree: 9f28d745d42b7ccb958cec0c8dbe448d01161656
parent: bf131b8060eeb0b6bb4704e32a75e7466bf8e1d6 [diff]
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 8404112..0085c74 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c

@@ -1284,6 +1284,38 @@
   this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
 }
 
+// Copies a tx_width x tx_width block from src (row stride src_stride) into the
+// contiguous buffer dst, multiplying each value by 8. The supported widths (4,
+// 8, 16) are enumerated explicitly so the compiler can generate SIMD for each;
+// per the disassembly, gcc emits SSE code for every case. The hottest case,
+// tx_width 16, is about 8% of the self cycles of av1_nonrd_pick_inter_mode_sb,
+// which is about 3% of total encoding time, so a hand-written AVX2 version
+// could save at most 3% * 8% = 0.24% of total encoding time.
+static AOM_INLINE void scale_square_buf_vals(int16_t *dst, const int tx_width,
+                                             const int16_t *src,
+                                             const int src_stride) {
+#define DO_SCALING                                                   \
+  do {                                                               \
+    for (int idy = 0; idy < tx_width; ++idy) {                       \
+      for (int idx = 0; idx < tx_width; ++idx) {                     \
+        dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
+      }                                                              \
+    }                                                                \
+  } while (0)
+
+  if (tx_width == 4) {
+    DO_SCALING;
+  } else if (tx_width == 8) {
+    DO_SCALING;
+  } else if (tx_width == 16) {
+    DO_SCALING;
+  } else {
+    assert(0);  // Unsupported tx_width: nothing is written to dst.
+  }
+
+#undef DO_SCALING
+}
+
+
 /*!\brief Calculates RD Cost when the block uses Identity transform.
  * Note that thie function is only for low bit depth encoding, since it
  * is called in real-time mode for now, which sets high bit depth to 0:
@@ -1353,11 +1385,7 @@
   for (int r = 0; r < max_blocks_high; r += block_step) {
     for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
       DECLARE_LOOP_VARS_BLOCK_YRD()
-      for (int idy = 0; idy < tx_wd; ++idy) {
-        for (int idx = 0; idx < tx_wd; ++idx) {
-          low_coeff[idy * tx_wd + idx] = src_diff[idy * diff_stride + idx] * 8;
-        }
-      }
+      scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
       av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
                       p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
                       eob, scan_order->scan, scan_order->iscan);
commit	d7fc49b8b6b5663ec93b6850839e3451e3dd0339	[log] [tgz]
author	chiyotsai <chiyotsai@google.com>	Tue Nov 22 13:50:39 2022 -0800
committer	chiyotsai <chiyotsai@google.com>	Wed Nov 23 13:46:30 2022 -0800
tree	9f28d745d42b7ccb958cec0c8dbe448d01161656
parent	bf131b8060eeb0b6bb4704e32a75e7466bf8e1d6 [diff]