diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index c279f60..e4db615 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -2958,7 +2958,7 @@
 
   // Shown keyframes and switch-frames automatically refreshes all reference
   // frames.  For all other frame types, we need to write refresh_frame_flags.
-  if ((current_frame->frame_type == KEY_FRAME && cpi->no_show_fwd_kf) ||
+  if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) ||
       current_frame->frame_type == INTER_FRAME ||
       current_frame->frame_type == INTRA_ONLY_FRAME)
     aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES);
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 6c4a3ce..0966093 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -46,7 +46,8 @@
 
 void av1_configure_buffer_updates(
     AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
-    const FRAME_UPDATE_TYPE type, int force_refresh_all) {
+    const FRAME_UPDATE_TYPE type, const FRAME_TYPE frame_type,
+    int force_refresh_all) {
   // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
 
@@ -74,7 +75,13 @@
 
     case ARF_UPDATE:
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
-      set_refresh_frame_flags(refresh_frame_flags, false, false, true);
+      if (frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
+        // TODO(bohanli): consider moving this to force_refresh_all?
+        // This is a keyframe coded as an arf.
+        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+      } else {
+        set_refresh_frame_flags(refresh_frame_flags, false, false, true);
+      }
       break;
 
     case INTNL_OVERLAY_UPDATE:
@@ -572,7 +579,8 @@
 // Update reference frame stack info.
 void av1_update_ref_frame_map(AV1_COMP *cpi,
                               FRAME_UPDATE_TYPE frame_update_type,
-                              int show_existing_frame, int ref_map_index,
+                              FRAME_TYPE frame_type, int show_existing_frame,
+                              int ref_map_index,
                               RefBufferStack *ref_buffer_stack) {
   AV1_COMMON *const cm = &cpi->common;
   // TODO(jingning): Consider the S-frame same as key frame for the
@@ -612,7 +620,16 @@
       break;
     case ARF_UPDATE:
     case INTNL_ARF_UPDATE:
-      update_arf_stack(ref_map_index, ref_buffer_stack);
+      if (frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
+        stack_reset(ref_buffer_stack->lst_stack,
+                    &ref_buffer_stack->lst_stack_size);
+        stack_reset(ref_buffer_stack->gld_stack,
+                    &ref_buffer_stack->gld_stack_size);
+        stack_reset(ref_buffer_stack->arf_stack,
+                    &ref_buffer_stack->arf_stack_size);
+      } else {
+        update_arf_stack(ref_map_index, ref_buffer_stack);
+      }
       stack_push(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size,
                  ref_map_index);
       break;
@@ -1024,10 +1041,13 @@
           find_unused_ref_frame(remapped_ref_idx, lst_stack, lst_stack_size);
     }
 
-    if (ref_map_index != INVALID_IDX)
+    if (ref_map_index != INVALID_IDX) {
       remapped_ref_idx[idx] = ref_map_index;
-    else
+    } else if (!gld_stack_size && arf_stack_size) {
+      remapped_ref_idx[idx] = ref_buffer_stack->arf_stack[0];
+    } else {
       remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0];
+    }
   }
 }
 
@@ -1161,9 +1181,9 @@
   // TODO(david.turner@argondesign.com): Change all the encode strategy to
   // modify frame_params instead of cm or cpi.
 
-  // Per-frame encode speed.  In theory this can vary, but things may have been
-  // written assuming speed-level will not change within a sequence, so this
-  // parameter should be used with caution.
+  // Per-frame encode speed.  In theory this can vary, but things may have
+  // been written assuming speed-level will not change within a sequence, so
+  // this parameter should be used with caution.
   frame_params.speed = oxcf->speed;
 
   // Work out some encoding parameters specific to the pass:
@@ -1199,7 +1219,8 @@
       !frame_params.show_existing_frame;
 
   av1_configure_buffer_updates(cpi, &frame_params.refresh_frame,
-                               frame_update_type, force_refresh_all);
+                               frame_update_type, frame_params.frame_type,
+                               force_refresh_all);
 
   if (!is_stat_generation_stage(cpi)) {
     const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME];
@@ -1217,9 +1238,18 @@
       ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]);
       ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL;
     }
+
     // Work out which reference frame slots may be used.
-    frame_params.ref_frame_flags = get_ref_frame_flags(
-        &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags);
+    if (av1_check_keyframe_overlay(gf_group->index, gf_group,
+                                   cpi->rc.frames_since_key)) {
+      // This is a KF overlay, so it should refer to the arf. However, a KF
+      // overlay has identical LAST and ALTREF references, so ALTREF would be
+      // disabled by get_ref_frame_flags(). Therefore set it manually.
+      frame_params.ref_frame_flags = av1_ref_frame_flag_list[ALTREF_FRAME];
+    } else {
+      frame_params.ref_frame_flags = get_ref_frame_flags(
+          &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags);
+    }
 
     frame_params.primary_ref_frame =
         choose_primary_ref_frame(cpi, &frame_params);
@@ -1238,9 +1268,9 @@
 
   // The way frame_params->remapped_ref_idx is setup is a placeholder.
   // Currently, reference buffer assignment is done by update_ref_frame_map()
-  // which is called by high-level strategy AFTER encoding a frame.  It modifies
-  // cm->remapped_ref_idx.  If you want to use an alternative method to
-  // determine reference buffer assignment, just put your assignments into
+  // which is called by high-level strategy AFTER encoding a frame.  It
+  // modifies cm->remapped_ref_idx.  If you want to use an alternative method
+  // to determine reference buffer assignment, just put your assignments into
   // frame_params->remapped_ref_idx here and they will be used when encoding
   // this frame.  If frame_params->remapped_ref_idx is setup independently of
   // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
@@ -1277,8 +1307,9 @@
     if (!ext_flags->refresh_frame.update_pending) {
       int ref_map_index =
           av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags);
-      av1_update_ref_frame_map(cpi, frame_update_type, cm->show_existing_frame,
-                               ref_map_index, &cpi->ref_buffer_stack);
+      av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
+                               cm->show_existing_frame, ref_map_index,
+                               &cpi->ref_buffer_stack);
     }
   }
 
@@ -1314,3 +1345,25 @@
 
   return AOM_CODEC_OK;
 }
+
+// Returns 1 if gf_index is a keyframe arf (ARF_UPDATE directly followed by
+// OVERLAY_UPDATE with frame_since_key == 0), and 0 otherwise, including for
+// fwd kf arfs. Call only after the gf group is defined, or it may return 0.
+int av1_check_keyframe_arf(int gf_index, GF_GROUP *gf_group,
+                           int frame_since_key) {
+  // Both gf_index and gf_index + 1 are read, so both must lie in the group.
+  if (gf_index < 0 || gf_index >= gf_group->size - 1) return 0;
+  return gf_group->update_type[gf_index] == ARF_UPDATE &&
+         gf_group->update_type[gf_index + 1] == OVERLAY_UPDATE &&
+         frame_since_key == 0;
+}
+
+// Returns 1 if gf_index is the OVERLAY_UPDATE that directly follows a
+// keyframe arf (frame_since_key == 0); returns 0 for fwd kf overlays too.
+int av1_check_keyframe_overlay(int gf_index, GF_GROUP *gf_group,
+                               int frame_since_key) {
+  if (gf_index < 1 || gf_index >= gf_group->size) return 0;  // outside group
+  return gf_group->update_type[gf_index - 1] == ARF_UPDATE &&
+         gf_group->update_type[gf_index] == OVERLAY_UPDATE &&
+         frame_since_key == 0;
+}
diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h
index 9d5c3ce..4bafb0a 100644
--- a/av1/encoder/encode_strategy.h
+++ b/av1/encoder/encode_strategy.h
@@ -63,7 +63,8 @@
 // refresh_*_frame flags to be set, because we refresh all buffers in this case.
 void av1_configure_buffer_updates(
     AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
-    const FRAME_UPDATE_TYPE type, int force_refresh_all);
+    const FRAME_UPDATE_TYPE type, const FRAME_TYPE frame_type,
+    int force_refresh_all);
 
 int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
                                 const EncodeFrameParams *const frame_params,
@@ -74,7 +75,8 @@
 
 void av1_update_ref_frame_map(AV1_COMP *cpi,
                               FRAME_UPDATE_TYPE frame_update_type,
-                              int show_existing_frame, int ref_map_index,
+                              FRAME_TYPE frame_type, int show_existing_frame,
+                              int ref_map_index,
                               RefBufferStack *ref_buffer_stack);
 
 void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
@@ -82,6 +84,11 @@
 int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
                                const int up_to_index,
                                const COMPRESSOR_STAGE compressor_stage);
+
+int av1_check_keyframe_arf(int gf_index, GF_GROUP *gf_group,
+                           int frame_since_key);
+int av1_check_keyframe_overlay(int gf_index, GF_GROUP *gf_group,
+                               int frame_since_key);
 /*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index c5bebc6..b829e55 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2813,7 +2813,12 @@
           realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width,
                                    cm->cur_frame->buf.y_crop_height);
     }
-    ++current_frame->frame_number;
+
+    // current_frame->frame_number is incremented already for
+    // keyframe overlays.
+    if (!av1_check_keyframe_overlay(cpi->gf_group.index, &cpi->gf_group,
+                                    cpi->rc.frames_since_key))
+      ++current_frame->frame_number;
 
     return AOM_CODEC_OK;
   }
@@ -3058,10 +3063,14 @@
   // A droppable frame might not be shown but it always
   // takes a space in the gf group. Therefore, even when
   // it is not shown, we still need update the count down.
-
   if (cm->show_frame) {
-    // Don't increment frame counters if this was an altref buffer
-    // update not a real frame
+    // Don't increment frame counters if this is a key frame overlay
+    if (!av1_check_keyframe_overlay(cpi->gf_group.index, &cpi->gf_group,
+                                    cpi->rc.frames_since_key))
+      ++current_frame->frame_number;
+  } else if (av1_check_keyframe_arf(cpi->gf_group.index, &cpi->gf_group,
+                                    cpi->rc.frames_since_key)) {
+    // TODO(bohanli): Hack: pre-increment frame_number for the kf overlay.
     ++current_frame->frame_number;
   }
 
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 70c4450..c118a44 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -2783,7 +2783,12 @@
   av1_zero(this_frame);
   // call above fn
   if (is_stat_consumption_stage(cpi)) {
-    process_first_pass_stats(cpi, &this_frame);
+    // Do not read stats for the overlay of a kf arf, since the kf has
+    // already advanced the first pass stats pointer.
+    if (!av1_check_keyframe_overlay(gf_group->index, gf_group,
+                                    rc->frames_since_key)) {
+      process_first_pass_stats(cpi, &this_frame);
+    }
   } else {
     rc->active_worst_quality = oxcf->rc_cfg.cq_level;
   }
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index f7abc4d..b57c1e0 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1015,7 +1015,8 @@
   }
   num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth + 1);
 
-  if (filter_frame_lookahead_idx == -1) {  // Key frame.
+  if (filter_frame_lookahead_idx == -1 ||
+      filter_frame_lookahead_idx == 0) {  // Key frame.
     num_before = 0;
     num_after = AOMMIN(num_frames - 1, max_after);
   } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
@@ -1141,7 +1142,7 @@
   }
 
   // Do filtering.
-  const int is_key_frame = (filter_frame_lookahead_idx < 0);
+  const int is_key_frame = (filter_frame_lookahead_idx <= 0);
   // Setup scaling factors. Scaling on each of the arnr frames is not
   // supported.
   // ARF is produced at the native frame size and resized when coded.
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 9bc1d1b..6a828a8 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -970,7 +970,10 @@
         frame_update_type == INTNL_OVERLAY_UPDATE ||
         frame_update_type == OVERLAY_UPDATE;
     frame_params.frame_type =
-        frame_update_type == KF_UPDATE ? KEY_FRAME : INTER_FRAME;
+        (frame_update_type == KF_UPDATE ||
+         av1_check_keyframe_arf(gf_index, gf_group, cpi->rc.frames_since_key))
+            ? KEY_FRAME
+            : INTER_FRAME;
 
     if (frame_update_type == LF_UPDATE)
       *pframe_qindex = gf_group->q_val[gf_index];
@@ -1007,7 +1010,7 @@
         cpi, &frame_params, frame_update_type, &ref_buffer_stack);
 
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type,
+    av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
                              frame_params.show_existing_frame,
                              refresh_frame_map_index, &ref_buffer_stack);
 
@@ -1067,7 +1070,7 @@
     int refresh_mask = av1_get_refresh_frame_flags(
         cpi, &frame_params, frame_update_type, &ref_buffer_stack);
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type,
+    av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
                              frame_params.show_existing_frame,
                              refresh_frame_map_index, &ref_buffer_stack);
 
@@ -1116,7 +1119,8 @@
   cm->current_frame.frame_type = frame_params->frame_type;
   for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) {
     av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
-                                 gf_group->update_type[gf_index], 0);
+                                 gf_group->update_type[gf_index],
+                                 cm->current_frame.frame_type, 0);
 
     memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
            sizeof(cpi->refresh_frame));
@@ -1174,15 +1178,18 @@
   }
 
   av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
-                               gf_group->update_type[gf_group->index], 0);
+                               gf_group->update_type[gf_group->index],
+                               frame_params->frame_type, 0);
   cm->current_frame.frame_type = frame_params->frame_type;
   cm->show_frame = frame_params->show_frame;
 
   if (cpi->common.tiles.large_scale) return 0;
   if (gf_group->max_layer_depth_allowed == 0) return 1;
+  assert(gf_group->arf_index >= 0);
 
   double beta[2] = { 0.0 };
-  for (int frame_idx = 1; frame_idx <= AOMMIN(tpl_gf_group_frames - 1, 2);
+  for (int frame_idx = gf_group->arf_index;
+       frame_idx <= AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1);
        ++frame_idx) {
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
     TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
