Support parsing multi-layer gf group and 7 ref frames in tpl tool

- Use the precomputed reference frame map in gf_group to calculate
  temporal dependency.
- Expand the length of the tpl stats buffer from MAX_LAG_BUFFERS to
  MAX_STATIC_GF_GROUP_LENGTH + 3, so that it can hold the maximum number
  of gf pictures, the key/golden frame, and 2 extended frames from the
  next gop. Otherwise the encoder fails due to memory issues.
- Only enable rdmult modulation on ARF frames.

Change-Id: Ibfea1086445dc948fda4b9bb14024101036fb9e0
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 926e30a..71e8f6a 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4131,9 +4131,12 @@
                                     int mi_col) {
   const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
   const int orig_rdmult = cpi->rd.RDMULT;
+  const int gf_group_index = cpi->twopass.gf_group.index;
   x->cb_rdmult = orig_rdmult;
-  if (cpi->twopass.gf_group.index > 0 && cpi->oxcf.enable_tpl_model &&
-      cpi->oxcf.aq_mode == NO_AQ && cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+
+  if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ &&
+      cpi->oxcf.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
+      cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE) {
     const int dr = get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult);
     x->rdmult = dr;
     x->cb_rdmult = x->rdmult;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 2f2eebc..c644ec3 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2711,7 +2711,7 @@
   av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
   av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
 
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
     int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
     int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
 
@@ -3055,7 +3055,7 @@
 #endif
   }
 
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+  for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
     aom_free(cpi->tpl_stats[frame].tpl_stats_ptr);
     cpi->tpl_stats[frame].is_valid = 0;
   }
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index a9e1379..b1cf0ea 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -157,6 +157,8 @@
   SS_CFG_TOTAL = 2
 } UENUM1BYTE(SS_CFG_OFFSET);
 
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_STATIC_GF_GROUP_LENGTH + 3)
+
 typedef struct TplDepStats {
   int64_t intra_cost;
   int64_t inter_cost;
@@ -741,7 +743,7 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  TplDepFrame tpl_stats[MAX_LAG_BUFFERS];
+  TplDepFrame tpl_stats[MAX_LENGTH_TPL_FRAME_STATS];
   YV12_BUFFER_CONFIG *tpl_recon_frames[INTER_REFS_PER_FRAME + 1];
 
   // For a still frame, this flag is set to 1 to skip partition search.
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 4b2075c..9f5dec4 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -24,7 +24,7 @@
 
 typedef struct GF_PICTURE {
   YV12_BUFFER_CONFIG *frame;
-  int ref_frame[7];
+  int ref_frame[INTER_REFS_PER_FRAME];
 } GF_PICTURE;
 
 static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
@@ -459,6 +459,12 @@
   }
 }
 
+#define REF_IDX(ref) ((ref)-LAST_FRAME)
+
+static INLINE void init_ref_frame_array(GF_PICTURE *gf_picture) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) gf_picture->ref_frame[i] = -1;
+}
+
 static void init_gop_frames_for_tpl(AV1_COMP *cpi, GF_PICTURE *gf_picture,
                                     const GF_GROUP *gf_group,
                                     int *tpl_group_frames,
@@ -466,18 +472,15 @@
   AV1_COMMON *cm = &cpi->common;
   const SequenceHeader *const seq_params = &cm->seq_params;
   int frame_idx = 0;
-  int i;
-  int gld_index = -1;
-  int lst_index = -1;
-  int extend_frame_count = 0;
-  int frame_gop_offset = 0;
+  int frame_disp_idx = 0;
   int pframe_qindex = cpi->tpl_stats[2].base_qindex;
 
   RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
   int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
                                                       -1, -1, -1, -1 };
 
-  for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
+  for (int i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1;
+       ++i) {
     if (frame_bufs[i].ref_count == 0) {
       alloc_frame_mvs(cm, &frame_bufs[i]);
       if (aom_realloc_frame_buffer(
@@ -493,7 +496,7 @@
     }
   }
 
-  for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
+  for (int i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
     assert(recon_frame_index[i] >= 0);
     cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
   }
@@ -503,65 +506,68 @@
   // Initialize Golden reference frame.
   RefCntBuffer *ref_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
   gf_picture[0].frame = &ref_buf->buf;
-  for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
-  gld_index = 0;
+  init_ref_frame_array(&gf_picture[0]);
   ++*tpl_group_frames;
 
-  // Initialize base layer ARF frame
-  gf_picture[1].frame = frame_input->source;
-  gf_picture[1].ref_frame[0] = gld_index;
-  // TODO(yuec) Need o  figure out full AV1 reference model
-  for (i = 1; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
-  ++*tpl_group_frames;
+  // Initialize frames in the GF group
+  for (frame_idx = 1; frame_idx <= gf_group->size; ++frame_idx) {
+    if (frame_idx == 1) {
+      gf_picture[1].frame = frame_input->source;
+    } else {
+      frame_disp_idx = gf_group->frame_disp_idx[frame_idx];
+      struct lookahead_entry *buf =
+          av1_lookahead_peek(cpi->lookahead, frame_disp_idx - 1);
 
-  // Initialize P frames
-  for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
-    struct lookahead_entry *buf;
-    frame_gop_offset = gf_group->frame_disp_idx[frame_idx];
-    buf = av1_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+      if (buf == NULL) break;
 
-    if (buf == NULL) break;
+      gf_picture[frame_idx].frame = &buf->img;
+    }
 
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = 1;
-    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+    memcpy(gf_picture[frame_idx].ref_frame,
+           gf_group->ref_frame_gop_idx[frame_idx],
+           sizeof(gf_picture[0].ref_frame[0]) * INTER_REFS_PER_FRAME);
 
     ++*tpl_group_frames;
-    lst_index = frame_idx;
-
-    if (frame_idx == gf_group->size) break;
   }
 
-  gld_index = frame_idx;
-  lst_index = AOMMAX(0, frame_idx - 1);
-  ++frame_idx;
-  ++frame_gop_offset;
+  ++frame_disp_idx;
+  int extend_frame_count = 0;
+  const int gld_idx_next_gop = gf_group->size;
+  const int lst_idx_next_gop =
+      gf_picture[gld_idx_next_gop].ref_frame[REF_IDX(LAST_FRAME)];
+  const int lst2_idx_next_gop =
+      gf_picture[gld_idx_next_gop].ref_frame[REF_IDX(LAST2_FRAME)];
+  const int lst3_idx_next_gop =
+      gf_picture[gld_idx_next_gop].ref_frame[REF_IDX(LAST3_FRAME)];
 
   // Extend two frames outside the current gf group.
-  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+  for (; frame_idx < MAX_LENGTH_TPL_FRAME_STATS && extend_frame_count < 2;
+       ++frame_idx) {
     struct lookahead_entry *buf =
-        av1_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+        av1_lookahead_peek(cpi->lookahead, frame_disp_idx - 1);
 
     if (buf == NULL) break;
 
     cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
 
     gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    for (i = 2; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
-    lst_index = frame_idx;
+
+    init_ref_frame_array(&gf_picture[frame_idx]);
+
+    gf_picture[frame_idx].ref_frame[REF_IDX(GOLDEN_FRAME)] = gld_idx_next_gop;
+    gf_picture[frame_idx].ref_frame[REF_IDX(LAST_FRAME)] = lst_idx_next_gop;
+    gf_picture[frame_idx].ref_frame[REF_IDX(LAST2_FRAME)] = lst2_idx_next_gop;
+    gf_picture[frame_idx].ref_frame[REF_IDX(LAST3_FRAME)] = lst3_idx_next_gop;
+
     ++*tpl_group_frames;
     ++extend_frame_count;
-    ++frame_gop_offset;
+    ++frame_disp_idx;
   }
 }
 
 static void init_tpl_stats(AV1_COMP *cpi) {
   int frame_idx;
-  for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+  for (frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) {
     TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
     memset(tpl_frame->tpl_stats_ptr, 0,
            tpl_frame->height * tpl_frame->width *
@@ -572,7 +578,7 @@
 
 void av1_tpl_setup_stats(AV1_COMP *cpi,
                          const EncodeFrameInput *const frame_input) {
-  GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
+  GF_PICTURE gf_picture[MAX_LENGTH_TPL_FRAME_STATS];
   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
   int tpl_group_frames = 0;
   int frame_idx;