Do MV scaling on the fly for memory and run time reduction

This change is not normative and produces the same results as before.
TPL_MV_REF data structure is about 5x smaller.
Observed overall decoder run time reduction is about 4%.
No observed change in encoder run time.

Change-Id: Id68a492bac3bf28f48b7ceeedf85cd29981238ee
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 47f6bff..e6bb7da 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -14,6 +14,27 @@
 
 #define USE_CUR_GM_REFMV 1
 
+#if CONFIG_MFMV
+// Although we assign 32 bit integers, all the values are strictly under 14
+// bits.
+static int div_mult[64] = {
+  0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, 1489, 1365,
+  1260, 1170,  1092, 1024, 963,  910,  862,  819,  780,  744,  712,  682,  655,
+  630,  606,   585,  564,  546,  528,  512,  496,  481,  468,  455,  442,  431,
+  420,  409,   399,  390,  381,  372,  364,  356,  348,  341,  334,  327,  321,
+  315,  309,   303,  297,  292,  287,  282,  277,  273,  268,  264,  260,
+};
+
+// TODO(jingning): Consider the use of lookup table for (num / den)
+// altogether.
+static void get_mv_projection(MV *output, MV ref, int num, int den) {
+  output->row =
+      (int16_t)(ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14));
+  output->col =
+      (int16_t)(ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14));
+}
+#endif  // CONFIG_MFMV
+
 void av1_copy_frame_mvs(const AV1_COMMON *const cm, MODE_INFO *mi, int mi_row,
                         int mi_col, int x_mis, int y_mis) {
 #if CONFIG_TMV || CONFIG_MFMV
@@ -455,10 +476,17 @@
   av1_set_ref_frame(rf, ref_frame);
 
   if (rf[1] == NONE_FRAME) {
+    int cur_frame_index = cm->cur_frame->cur_frame_offset;
+    int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+    int cur_offset_0 = cur_frame_index -
+                       cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+
     for (int i = 0; i < MFMV_STACK_SIZE; ++i) {
-      if (prev_frame_mvs->mfmv[ref_frame - LAST_FRAME][i].as_int !=
-          INVALID_MV) {
-        int_mv this_refmv = prev_frame_mvs->mfmv[ref_frame - LAST_FRAME][i];
+      if (prev_frame_mvs->mfmv0[i].as_int != INVALID_MV) {
+        int_mv this_refmv;
+
+        get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0[i].as_mv,
+                          cur_offset_0, prev_frame_mvs->ref_frame_offset[i]);
 #if CONFIG_AMVR
         lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
                            cm->cur_frame_force_integer_mv);
@@ -494,11 +522,23 @@
     }
   } else {
     // Process compound inter mode
+    int cur_frame_index = cm->cur_frame->cur_frame_offset;
+    int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+    int cur_offset_0 = cur_frame_index -
+                       cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+    int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx;
+    int cur_offset_1 = cur_frame_index -
+                       cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
+
     for (int i = 0; i < MFMV_STACK_SIZE; ++i) {
-      if (prev_frame_mvs->mfmv[rf[0] - LAST_FRAME][i].as_int != INVALID_MV &&
-          prev_frame_mvs->mfmv[rf[1] - LAST_FRAME][i].as_int != INVALID_MV) {
-        int_mv this_refmv = prev_frame_mvs->mfmv[rf[0] - LAST_FRAME][i];
-        int_mv comp_refmv = prev_frame_mvs->mfmv[rf[1] - LAST_FRAME][i];
+      if (prev_frame_mvs->mfmv0[i].as_int != INVALID_MV) {
+        int_mv this_refmv;
+        int_mv comp_refmv;
+        get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0[i].as_mv,
+                          cur_offset_0, prev_frame_mvs->ref_frame_offset[i]);
+        get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0[i].as_mv,
+                          cur_offset_1, prev_frame_mvs->ref_frame_offset[i]);
+
 #if CONFIG_AMVR
         lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
                            cm->cur_frame_force_integer_mv);
@@ -1361,27 +1401,6 @@
 #endif  // CONFIG_FRAME_MARKER
 
 #if CONFIG_MFMV
-// Although we assign 32 bit integers, all the values are strictly under 14
-// bits.
-static int div_mult[64] = {
-  0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, 1489, 1365,
-  1260, 1170,  1092, 1024, 963,  910,  862,  819,  780,  744,  712,  682,  655,
-  630,  606,   585,  564,  546,  528,  512,  496,  481,  468,  455,  442,  431,
-  420,  409,   399,  390,  381,  372,  364,  356,  348,  341,  334,  327,  321,
-  315,  309,   303,  297,  292,  287,  282,  277,  273,  268,  264,  260,
-};
-
-// TODO(jingning): Consider the use of lookup table for (num / den)
-// altogether.
-static void get_mv_projection(MV *output, MV ref, int num, int den) {
-  output->row =
-      (int16_t)(ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14));
-  output->col =
-      (int16_t)(ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14));
-}
-#endif  // CONFIG_MFMV
-
-#if CONFIG_MFMV
 #define MAX_OFFSET_WIDTH 64
 #define MAX_OFFSET_HEIGHT 0
 
@@ -1490,12 +1509,12 @@
         if (pos_valid) {
           int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
 
-          for (MV_REFERENCE_FRAME rf = ALTREF_FRAME; rf >= LAST_FRAME; --rf) {
-            get_mv_projection(&this_mv.as_mv, fwd_mv, cur_offset[rf],
-                              ref_frame_offset);
-            tpl_mvs_base[mi_offset].mfmv[FWD_RF_OFFSET(rf)][ref_stamp].as_int =
-                this_mv.as_int;
-          }
+          tpl_mvs_base[mi_offset].mfmv0[ref_stamp].as_mv.row =
+              (dir == 1) ? -fwd_mv.row : fwd_mv.row;
+          tpl_mvs_base[mi_offset].mfmv0[ref_stamp].as_mv.col =
+              (dir == 1) ? -fwd_mv.col : fwd_mv.col;
+          tpl_mvs_base[mi_offset].ref_frame_offset[ref_stamp] =
+              ref_frame_offset;
         }
       }
     }
@@ -1510,11 +1529,11 @@
   int bwd_frame_index = 0, alt2_frame_index = 0;
 
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
-  for (int ref_frame = 0; ref_frame < INTER_REFS_PER_FRAME; ++ref_frame) {
-    int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
-    for (int idx = 0; idx < size; ++idx) {
-      for (int i = 0; i < MFMV_STACK_SIZE; ++i)
-        tpl_mvs_base[idx].mfmv[ref_frame][i].as_int = INVALID_MV;
+  int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+  for (int idx = 0; idx < size; ++idx) {
+    for (int i = 0; i < MFMV_STACK_SIZE; ++i) {
+      tpl_mvs_base[idx].mfmv0[i].as_int = INVALID_MV;
+      tpl_mvs_base[idx].ref_frame_offset[i] = 0;
     }
   }
 
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 5c12805..88201a5 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -120,8 +120,8 @@
 #define MFMV_STACK_SIZE 3
 
 typedef struct {
-  int_mv mfmv[INTER_REFS_PER_FRAME][MFMV_STACK_SIZE];
-  int mem_size;
+  int_mv mfmv0[MFMV_STACK_SIZE];
+  uint8_t ref_frame_offset[MFMV_STACK_SIZE];
 } TPL_MV_REF;
 #endif
 
@@ -590,6 +590,7 @@
 #endif
 #if CONFIG_MFMV
   TPL_MV_REF *tpl_mvs;
+  int tpl_mvs_mem_size;
   // TODO(jingning): This can be combined with sign_bias later.
   int8_t ref_frame_side[TOTAL_REFS_PER_FRAME];
 #endif
@@ -720,13 +721,13 @@
   const int mem_size =
       ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
   int realloc = cm->tpl_mvs == NULL;
-  if (cm->tpl_mvs) realloc |= cm->tpl_mvs->mem_size < mem_size;
+  if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size;
 
   if (realloc) {
     aom_free(cm->tpl_mvs);
     CHECK_MEM_ERROR(cm, cm->tpl_mvs,
                     (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
-    cm->tpl_mvs->mem_size = mem_size;
+    cm->tpl_mvs_mem_size = mem_size;
   }
 #endif
 }