RTC: Let compiler generate SIMD for scaling in block_yrd_idtx
Performance:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|------------|----------|----------|---------|-------|
| 5 | rtc_screen | +0.000% | +0.000% | +0.000% | -2.7% |
|---------|------------|----------|----------|---------|-------|
| 6 | rtc_screen | +0.000% | +0.000% | +0.000% | -2.7% |
|---------|------------|----------|----------|---------|-------|
| 7 | rtc_screen | +0.000% | +0.000% | +0.000% | -3.0% |
|---------|------------|----------|----------|---------|-------|
| 8 | rtc_screen | +0.000% | +0.000% | +0.000% | -3.0% |
|---------|------------|----------|----------|---------|-------|
| 9 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.4% |
|---------|------------|----------|----------|---------|-------|
| 10 | rtc_screen | +0.000% | +0.000% | +0.000% | -0.5% |
Change-Id: Ibcb168d5e530b2dcae089f43de0efdfe4eccd5c7
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 8404112..0085c74 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1284,6 +1284,38 @@
this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
}
+// Explicitly enumerate the cases so the compiler can generate SIMD for the
+// function. According to the disassembler, gcc generates SSE codes for each of
+// the possible block sizes. The hottest case is tx_width 16, which takes up
+// about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since
+// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
+// potential room of improvement for writing AVX2 optimization is only 3% * 8% =
+// 0.24% of total encoding time.
+static AOM_INLINE void scale_square_buf_vals(int16_t *dst, const int tx_width,
+ const int16_t *src,
+ const int src_stride) {
+#define DO_SCALING \
+ do { \
+ for (int idy = 0; idy < tx_width; ++idy) { \
+ for (int idx = 0; idx < tx_width; ++idx) { \
+ dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
+ } \
+ } \
+ } while (0)
+
+ if (tx_width == 4) {
+ DO_SCALING;
+ } else if (tx_width == 8) {
+ DO_SCALING;
+ } else if (tx_width == 16) {
+ DO_SCALING;
+ } else {
+ assert(0);
+ }
+
+#undef DO_SCALING
+}
+
/*!\brief Calculates RD Cost when the block uses Identity transform.
* Note that thie function is only for low bit depth encoding, since it
* is called in real-time mode for now, which sets high bit depth to 0:
@@ -1353,11 +1385,7 @@
for (int r = 0; r < max_blocks_high; r += block_step) {
for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
DECLARE_LOOP_VARS_BLOCK_YRD()
- for (int idy = 0; idy < tx_wd; ++idy) {
- for (int idx = 0; idx < tx_wd; ++idx) {
- low_coeff[idy * tx_wd + idx] = src_diff[idy * diff_stride + idx] * 8;
- }
- }
+ scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
eob, scan_order->scan, scan_order->iscan);