rtc: Cleanup/fixes for aq-mode=3

Remove the last_coded_q, as this was not
currently being updated and hence not used.
This removes an allocation for 1 layer and
also for the multi-layer case.

Also skip looping over 4x4 blocks: the aq-mode=3
segment feature for RTC is only used for blocks >= 8X8,
so skipping over 4x4 block can gives a small speedup.
But there is a small bdrate loss (~0.2%), since for more
efficient coding of the seg map, the cur_frame->seg_map
needs to set at 4x4, along with the function
av1_cyclic_reset_segment_skip().

This was mentioned here:
https://aomedia-review.googlesource.com/c/aom/+/136503

Speedup is ~1.5% on vga, speed 10.

Change-Id: I15efe2909aab108678fa15f043fd1ab00f3eec8c
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 01afb99..452a66f 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -21,7 +21,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
-  size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
   if (cr == NULL) return NULL;
 
@@ -30,21 +29,12 @@
     av1_cyclic_refresh_free(cr);
     return NULL;
   }
-  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
-  cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
-  if (cr->last_coded_q_map == NULL) {
-    av1_cyclic_refresh_free(cr);
-    return NULL;
-  }
-  assert(MAXQ <= 255);
-  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
   return cr;
 }
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   if (cr != NULL) {
     aom_free(cr->map);
-    aom_free(cr->last_coded_q_map);
     aom_free(cr);
   }
 }
@@ -155,6 +145,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
   const int prev_segment_id = mbmi->segment_id;
   mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
   if (prev_segment_id != mbmi->segment_id) {
@@ -164,8 +155,8 @@
     const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
     const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
     const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
-    for (int mi_y = 0; mi_y < ymis; mi_y++) {
-      for (int mi_x = 0; mi_x < xmis; mi_x++) {
+    for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+      for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
         const int map_offset =
             block_index + mi_y * cm->mi_params.mi_cols + mi_x;
         cr->map[map_offset] = 0;
@@ -200,6 +191,7 @@
   const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
   const int refresh_this_block =
       candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
 
@@ -229,8 +221,8 @@
 
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
-  for (int mi_y = 0; mi_y < ymis; mi_y++) {
-    for (int mi_x = 0; mi_x < xmis; mi_x++) {
+  for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+    for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
       const int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x;
       cr->map[map_offset] = new_map_value;
       cpi->enc_seg.map[map_offset] = mbmi->segment_id;
@@ -342,13 +334,6 @@
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * cm->seq_params->mib_size;
     int mi_col = sb_col_index * cm->seq_params->mib_size;
-    // TODO(any): Ensure the population of
-    // cpi->common.features.allow_screen_content_tools and use the same instead
-    // of cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
-    int qindex_thresh = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
-                            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2,
-                                             cm->quant_params.base_qindex)
-                            : 0;
     assert(mi_row >= 0 && mi_row < mi_params->mi_rows);
     assert(mi_col >= 0 && mi_col < mi_params->mi_cols);
     bl_index = mi_row * mi_params->mi_cols + mi_col;
@@ -363,7 +348,7 @@
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than GLOBALMV.
         if (cr->map[bl_index2] == 0) {
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map += 4;
+          sum_map += 4;
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -399,6 +384,15 @@
   double weight_segment = 0;
   int qp_thresh = AOMMIN(20, rc->best_quality << 1);
   int qp_max_thresh = 118 * MAXQ >> 7;
+  // Although this segment feature for RTC is only used for
+  // blocks >= 8X8, for more efficient coding of the seg map
+  // cur_frame->seg_map needs to set at 4x4 along with the
+  // function av1_cyclic_reset_segment_skip(). Skipping over
+  // 4x4 will therefore have small bdrate loss (~0.2%), so
+  // we use it only for speed > 9 for now.
+  // Also if loop-filter deltas is applied via segment, then
+  // we need to set cr->skip_over4x4 = 1.
+  cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
       cpi->svc.temporal_layer_id > 0 ||
@@ -481,9 +475,6 @@
     memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_disable_segmentation(&cm->seg);
     if (cm->current_frame.frame_type == KEY_FRAME) {
-      memset(cr->last_coded_q_map, MAXQ,
-             cm->mi_params.mi_rows * cm->mi_params.mi_cols *
-                 sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
     }
     return;
diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h
index 1c0d5cb..4e4e1f2 100644
--- a/av1/encoder/aq_cyclicrefresh.h
+++ b/av1/encoder/aq_cyclicrefresh.h
@@ -80,10 +80,6 @@
    */
   int8_t *map;
   /*!
-   * Map of the last q a block was coded at.
-   */
-  uint8_t *last_coded_q_map;
-  /*!
    * Threshold applied to the projected rate of the coding block,
    * when deciding whether block should be refreshed.
    */
@@ -111,6 +107,7 @@
   int qindex_delta[3];
   double weight_segment;
   int apply_cyclic_refresh;
+  int skip_over4x4;
   /*!\endcond */
 };
 
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 8c5fa63..1bc6b32 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -2083,7 +2083,8 @@
     }
     if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
   }
-  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm)
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+      !cpi->cyclic_refresh->skip_over4x4)
     av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize);
   // TODO(Ravi/Remya): Move this copy function to a better logical place
   // This function will copy the best mode information from block
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index 67b3082..4e48218 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -62,7 +62,6 @@
       // (i.e., ss_number_layers > 1), these need to be updated per spatial
       // layer. Cyclic refresh is only applied on base temporal layer.
       if (svc->number_spatial_layers > 1 && tl == 0) {
-        size_t last_coded_q_map_size;
         lc->sb_index = 0;
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
@@ -71,13 +70,6 @@
         CHECK_MEM_ERROR(cm, lc->map,
                         aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
-        last_coded_q_map_size =
-            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
-                        aom_malloc(last_coded_q_map_size));
-        assert(MAXQ <= 255);
-        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
       }
     }
     svc->downsample_filter_type[sl] = BILINEAR;
@@ -195,7 +187,6 @@
       svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     swap_ptr(&cr->map, &lc->map);
-    swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
     cr->sb_index = lc->sb_index;
     cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
     cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
@@ -234,11 +225,8 @@
       cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = lc->map;
-    uint8_t *temp2 = lc->last_coded_q_map;
     lc->map = cr->map;
     cr->map = temp;
-    lc->last_coded_q_map = cr->last_coded_q_map;
-    cr->last_coded_q_map = temp2;
     lc->sb_index = cr->sb_index;
     lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
     lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
@@ -301,7 +289,6 @@
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       if (lc->map) aom_free(lc->map);
-      if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
     }
   }
 }
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index a4fe654..310d08a 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -49,11 +49,6 @@
    */
   int8_t *map;
   /*!
-   * Segmentation map for last coded quantization paramters.
-   */
-  uint8_t *last_coded_q_map;
-
-  /*!
    * Number of blocks on segment 1
    */
   int actual_num_seg1_blocks;
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index bf8443c..5e360f2 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -37,21 +37,21 @@
                            { 7, { { 0, 34.9 }, { 3, 35.8 } } },
                            { 8, { { 0, 35.0 }, { 3, 35.8 } } },
                            { 9, { { 0, 34.9 }, { 3, 35.5 } } },
-                           { 10, { { 0, 34.8 }, { 3, 35.4 } } } } },
+                           { 10, { { 0, 34.7 }, { 3, 35.3 } } } } },
                        { "paris_352_288_30.y4m",
                          { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
                            { 6, { { 0, 36.1 }, { 3, 36.5 } } },
                            { 7, { { 0, 35.5 }, { 3, 36.0 } } },
                            { 8, { { 0, 36.0 }, { 3, 36.5 } } },
                            { 9, { { 0, 35.5 }, { 3, 36.0 } } },
-                           { 10, { { 0, 35.4 }, { 3, 36.0 } } } } },
+                           { 10, { { 0, 35.3 }, { 3, 35.9 } } } } },
                        { "niklas_1280_720_30.y4m",
                          { { 5, { { 0, 34.4 }, { 3, 34.30 } } },
                            { 6, { { 0, 34.2 }, { 3, 34.2 } } },
                            { 7, { { 0, 33.6 }, { 3, 33.6 } } },
                            { 8, { { 0, 33.48 }, { 3, 33.48 } } },
                            { 9, { { 0, 33.4 }, { 3, 33.4 } } },
-                           { 10, { { 0, 33.3 }, { 3, 33.3 } } } } } };
+                           { 10, { { 0, 33.2 }, { 3, 33.2 } } } } } };
 
 typedef struct {
   const char *filename;