Tweak rdmult with `tune=ssimulacra2`

Tweak rdmult derivation in two places:
1. When computing rdmult in many parts of the encoder:
   - `av1_compute_rd_mult()`
   - `av1_compute_rd_mult_based_on_qindex()`
2. While performing trellis quantization:
   - `av1_optimize_txb()`

These two tweaks work synergistically to improve subjective
quality and SSIMULACRA2 scores.

Approximate BD-rate gains relative to no rdmult tweaks, measured at
cpu-used=6 on Daala's subset1:
- SSIMULACRA2 60: -1.2%
- SSIMULACRA2 70: -1.4%
- SSIMULACRA2 80: -1.2%
- SSIMULACRA2 90: -1.3%

Bug: aomedia:375221136
Change-Id: I559ee11f1924e6ea069e56b2accbfd00f41c24c7
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 5c08726..ea337e3 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -645,7 +645,7 @@
         qindex2, cm->seq_params->bit_depth,
         cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
         boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-        is_stat_consumption_stage(cpi));
+        is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning);
 
     av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
 
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index f948405..12f58e3 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -815,7 +815,7 @@
       qindex_rdmult, cm->seq_params->bit_depth,
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-      is_stat_consumption_stage(cpi));
+      is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning);
 
   const int qindex_change = x->qindex != qindex;
   if (qindex_change || do_update) {
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 05afd61..46d36c8 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -317,7 +317,8 @@
 
   return av1_compute_rd_mult(
       qindex, bit_depth, update_type, layer_depth, boost_index, frame_type,
-      cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi));
+      cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi),
+      cpi->oxcf.tune_cfg.tuning);
 }
 
 static inline int do_split_check(BLOCK_SIZE bsize) {
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 2d623fa..1067288 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3382,7 +3382,8 @@
 
     // Note: Both use common rdmult based on base qindex of fullres.
     const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
-        bit_depth, update_type, cm->quant_params.base_qindex);
+        bit_depth, update_type, cm->quant_params.base_qindex,
+        cpi->oxcf.tune_cfg.tuning);
 
     // Find the best rdcost among all superres denoms.
     int best_denom = -1;
@@ -3446,7 +3447,8 @@
 
     // Note: Both use common rdmult based on base qindex of fullres.
     const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
-        bit_depth, update_type, cm->quant_params.base_qindex);
+        bit_depth, update_type, cm->quant_params.base_qindex,
+        cpi->oxcf.tune_cfg.tuning);
     proj_rdcost1 =
         RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth);
     const double proj_rdcost2 =
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index e77dbc7..2c26688 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -386,7 +386,7 @@
 
 int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
                                         FRAME_UPDATE_TYPE update_type,
-                                        int qindex) {
+                                        int qindex, aom_tune_metric tuning) {
   const int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
   int64_t rdmult = q * q;
   if (update_type == KF_UPDATE) {
@@ -400,6 +400,25 @@
     rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
   }
 
+  if (tuning == AOM_TUNE_SSIMULACRA2) {
+    // Further multiply rdmult (by up to 200/128 = 1.5625) to improve image
+    // quality. The most noticeable effect is a mild bias towards choosing
+    // larger transform sizes (e.g. one 16x16 transform instead of 4 8x8
+    // transforms).
+    // For very high qindexes, start progressively reducing the weight towards
+    // unity (128/128), as transforms are large enough and making them even
+    // larger actually harms subjective quality and SSIMULACRA 2 scores.
+    // The weight part of the equation was determined by iteratively increasing
+    // weight on CID22 and Daala's subset1, and observing its effects on visual
+    // quality and SSIMULACRA 2 scores along the usable (0-100) range.
+    // The ramp-down part of the equation was determined by choosing a fixed
+    // initial qindex point [qindex 159, where (255 - 159) * 3 / 4 = 72] such
+    // that SSIMULACRA 2 scores for encodes with qindexes greater than 159 were
+    // at or above their equivalents with no rdmult adjustment.
+    const int weight = clamp(((255 - qindex) * 3) / 4, 0, 72) + 128;
+    rdmult = (int64_t)((double)rdmult * weight / 128.0);
+  }
+
   switch (bit_depth) {
     case AOM_BITS_8: break;
     case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
@@ -416,9 +435,10 @@
                         const int layer_depth, const int boost_index,
                         const FRAME_TYPE frame_type,
                         const int use_fixed_qp_offsets,
-                        const int is_stat_consumption_stage) {
-  int64_t rdmult =
-      av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
+                        const int is_stat_consumption_stage,
+                        const aom_tune_metric tuning) {
+  int64_t rdmult = av1_compute_rd_mult_based_on_qindex(bit_depth, update_type,
+                                                       qindex, tuning);
   if (is_stat_consumption_stage && !use_fixed_qp_offsets &&
       (frame_type != KEY_FRAME)) {
     // Layer depth adjustment
@@ -426,7 +446,7 @@
     // ARF boost adjustment
     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
   }
-  return (int)rdmult;
+  return rdmult > 0 ? (int)AOMMIN(rdmult, INT_MAX) : 1;
 }
 
 int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) {
@@ -486,7 +506,7 @@
                    cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
                    layer_depth, boost_index, frame_type,
                    cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-                   is_stat_consumption_stage(cpi)) /
+                   is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning) /
                beta);
 }
 #endif  // !CONFIG_REALTIME_ONLY
@@ -778,7 +798,7 @@
       qindex_rdmult, cm->seq_params->bit_depth,
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-      is_stat_consumption_stage(cpi));
+      is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning);
 #if CONFIG_RD_COMMAND
   if (cpi->oxcf.pass == 2) {
     const RD_COMMAND *rd_command = &cpi->rd_command;
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index d4db276..0b79fd9 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -14,8 +14,8 @@
 
 #include <limits.h>
 
+#include "aom/aomcx.h"
 #include "av1/common/blockd.h"
-
 #include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/cost.h"
@@ -232,19 +232,21 @@
  * \param[in]       bit_depth       bit depth
  * \param[in]       update_type     frame update type
  * \param[in]       qindex          q index
+ * \param[in]       tuning          visual tuning metric
  *
  * \return rdmult
  */
 int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
                                         FRAME_UPDATE_TYPE update_type,
-                                        int qindex);
+                                        int qindex, aom_tune_metric tuning);
 
 int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
                         const FRAME_UPDATE_TYPE update_type,
                         const int layer_depth, const int boost_index,
                         const FRAME_TYPE frame_type,
                         const int use_fixed_qp_offsets,
-                        const int is_stat_consumption_stage);
+                        const int is_stat_consumption_stage,
+                        const aom_tune_metric tuning);
 
 void av1_initialize_rd_consts(struct AV1_COMP *cpi);
 
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 202ac1f..a70f0c9 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -1375,12 +1375,17 @@
 
   const int base_qindex =
       cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex;
+  // The TPL model is only meant to be run in inter mode, so ensure that we are
+  // not running in all intra mode, which implies we are not tuning for
+  // SSIMULACRA 2.
+  assert(cpi->oxcf.tune_cfg.tuning != AOM_TUNE_SSIMULACRA2 &&
+         cpi->oxcf.mode != ALLINTRA);
   // Get rd multiplier set up.
-  rdmult = (int)av1_compute_rd_mult(
+  rdmult = av1_compute_rd_mult(
       base_qindex, cm->seq_params->bit_depth,
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-      is_stat_consumption_stage(cpi));
+      is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning);
 
   if (rdmult < 1) rdmult = 1;
   av1_set_error_per_bit(&x->errorperbit, rdmult);
@@ -1395,7 +1400,8 @@
   const FRAME_UPDATE_TYPE update_type =
       gf_group->update_type[cpi->gf_frame_index];
   tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
-                               bd_info.bit_depth, update_type, base_qindex) /
+                               bd_info.bit_depth, update_type, base_qindex,
+                               cpi->oxcf.tune_cfg.tuning) /
                            6;
 
   if (cpi->use_ducky_encode)
@@ -2105,7 +2111,7 @@
       orig_qindex_rdmult, cm->seq_params->bit_depth,
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-      is_stat_consumption_stage(cpi));
+      is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning);
 
   const int new_qindex_rdmult = quant_params->base_qindex +
                                 x->rdmult_delta_qindex +
@@ -2114,7 +2120,7 @@
       new_qindex_rdmult, cm->seq_params->bit_depth,
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
-      is_stat_consumption_stage(cpi));
+      is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning);
 
   const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
 
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index 4fc4ba3..68e3c3c 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -335,12 +335,19 @@
   const LV_MAP_EOB_COST *txb_eob_costs =
       &coeff_costs->eob_costs[eob_multi_size][plane_type];
 
-  const int rshift = 2;
+  // For the SSIMULACRA 2 tune, increase rshift from 2 to 4.
+  // This biases trellis quantization towards keeping more coefficients, and
+  // together with the SSIMULACRA2 rdmult adjustment in
+  // av1_compute_rd_mult_based_on_qindex(), this helps preserve image
+  // features (like repeating patterns and camera noise/film grain), which
+  // improves SSIMULACRA 2 scores.
+  const int rshift = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIMULACRA2 ? 4 : 2;
+  const int rounding = (1 << rshift) >> 1;
 
   const int64_t rdmult =
       (((int64_t)x->rdmult *
         (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
-       2) >>
+       rounding) >>
       rshift;
 
   uint8_t levels_buf[TX_PAD_2D];