AV1 RealTime path. Initial implementation

Change-Id: If264acb0f6df620a2af2fd19706e023eadee5382
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index 60c1c82..f8a7cec 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -861,6 +861,11 @@
  */
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 
+/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
+#define AOM_USAGE_GOOD_QUALITY (0)
+/*!\brief usage parameter analogous to AV1 REALTIME mode. */
+#define AOM_USAGE_REALTIME (1)
+
 /*!\brief Encode a frame
  *
  * Encodes a video frame at the given "presentation time." The presentation
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 927e352..c83a99f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -836,6 +836,18 @@
   specialize qw/aom_highbd_sad64x16x4d sse2/;
 
   #
+  # Avg
+  #
+  add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/aom_avg_8x8 sse2/;
+
+  add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/aom_avg_4x4 sse2/;
+
+  add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/aom_minmax_8x8 sse2/;
+
+  #
   # hamadard transform and satd for implmenting temporal dependency model
   #
   add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 4d78c9c..a97cd0f 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -14,6 +14,40 @@
 #include "config/aom_dsp_rtcd.h"
 #include "aom_ports/mem.h"
 
+void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j] - d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
+unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s += p)
+    for (j = 0; j < 4; sum += s[j], ++j) {
+    }
+
+  return (sum + 8) >> 4;
+}
+
+unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {
+    }
+
+  return (sum + 32) >> 6;
+}
+
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 969e4e1..0c20261 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -16,6 +16,129 @@
 #include "aom_dsp/x86/bitdepth_conversion_sse2.h"
 #include "aom_ports/mem.h"
 
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0 = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff, 0);
+
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+  *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0 = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 32) >> 6;
+}
+
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0 = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
+
 static void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 83c1353..0123354 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -154,6 +154,8 @@
     ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
 static const arg_def_t good_dl =
     ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t rt_dl =
+    ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline");
 static const arg_def_t quietarg =
     ARG_DEF("q", "quiet", 0, "Do not print encode progress");
 static const arg_def_t verbosearg =
@@ -219,6 +221,7 @@
                                         &limit,
                                         &skip,
                                         &good_dl,
+                                        &rt_dl,
                                         &quietarg,
                                         &verbosearg,
                                         &psnrarg,
@@ -1064,7 +1067,9 @@
     } else if (arg_match(&arg, &usage, argi))
       global->usage = arg_parse_uint(&arg);
     else if (arg_match(&arg, &good_dl, argi))
-      warn("Deprecated --good option! Ignoring\n");
+      global->usage = AOM_USAGE_GOOD_QUALITY;  // Good quality usage
+    else if (arg_match(&arg, &rt_dl, argi))
+      global->usage = AOM_USAGE_REALTIME;  // Real-time usage
     else if (arg_match(&arg, &use_yv12, argi))
       global->color_type = YV12;
     else if (arg_match(&arg, &use_i420, argi))
@@ -1117,11 +1122,19 @@
     // Make default AV1 passes = 2 until there is a better quality 1-pass
     // encoder
     if (global->codec != NULL && global->codec->name != NULL)
-      global->passes = (strcmp(global->codec->name, "av1") == 0) ? 2 : 1;
+      global->passes = (strcmp(global->codec->name, "av1") == 0 &&
+                        global->usage != AOM_USAGE_REALTIME)
+                           ? 2
+                           : 1;
 #else
     global->passes = 1;
 #endif
   }
+
+  if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) {
+    warn("Enforcing one-pass encoding in realtime mode\n");
+    global->passes = 1;
+  }
 }
 
 static void open_input_file(struct AvxInputContext *input,
@@ -1355,6 +1368,12 @@
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
+      if (global->usage == AOM_USAGE_REALTIME &&
+          config->cfg.rc_end_usage == AOM_CBR &&
+          config->cfg.g_lag_in_frames != 0) {
+        warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name);
+        config->cfg.g_lag_in_frames = 0;
+      }
     } else if (arg_match(&arg, &large_scale_tile, argi)) {
       config->cfg.large_scale_tile = arg_parse_uint(&arg);
       if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder();
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 0b4901a..e99a2ed 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -194,6 +194,8 @@
             "${AOM_ROOT}/av1/encoder/tpl_model.c"
             "${AOM_ROOT}/av1/encoder/tpl_model.h"
             "${AOM_ROOT}/av1/encoder/wedge_utils.c"
+            "${AOM_ROOT}/av1/encoder/var_based_part.c"
+            "${AOM_ROOT}/av1/encoder/var_based_part.h"
             "${AOM_ROOT}/third_party/fastfeat/fast.c"
             "${AOM_ROOT}/third_party/fastfeat/fast.h"
             "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 4e84958..54531e5 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -295,6 +295,7 @@
   RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
   RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
   RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+  RANGE_CHECK_HI(cfg, g_usage, 1);
   RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
   RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
@@ -498,6 +499,7 @@
   oxcf->profile = cfg->g_profile;
   oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
   oxcf->max_threads = (int)cfg->g_threads;
+  oxcf->mode = (cfg->g_usage == 1) ? REALTIME : GOOD;
   oxcf->width = cfg->g_w;
   oxcf->height = cfg->g_h;
   oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
@@ -540,7 +542,6 @@
     oxcf->init_framerate = 30;
     oxcf->timing_info_present = 0;
   }
-  oxcf->mode = GOOD;
   oxcf->cfg = &cfg->cfg;
 
   switch (cfg->g_pass) {
@@ -1507,8 +1508,7 @@
       }
     }
   }
-
-  if (ctx->oxcf.mode != GOOD) {
+  if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) {
     ctx->oxcf.mode = GOOD;
     av1_change_config(ctx->cpi, &ctx->oxcf);
   }
@@ -2093,7 +2093,7 @@
   { 0,
     {
         // NOLINT
-        0,  // g_usage
+        0,  // g_usage - non-realtime usage
         0,  // g_threads
         0,  // g_profile
 
@@ -2158,6 +2158,74 @@
         { 0 },        // tile_heights
         { 1 },        // config file
     } },
+  { 1,
+    {
+        // NOLINT
+        1,  // g_usage - real-time usage
+        0,  // g_threads
+        0,  // g_profile
+
+        320,         // g_width
+        240,         // g_height
+        0,           // g_limit
+        0,           // g_forced_max_frame_width
+        0,           // g_forced_max_frame_height
+        AOM_BITS_8,  // g_bit_depth
+        8,           // g_input_bit_depth
+
+        { 1, 30 },  // g_timebase
+
+        0,  // g_error_resilient
+
+        AOM_RC_ONE_PASS,  // g_pass
+
+        1,  // g_lag_in_frames
+
+        0,                // rc_dropframe_thresh
+        RESIZE_NONE,      // rc_resize_mode
+        SCALE_NUMERATOR,  // rc_resize_denominator
+        SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+        0,                // rc_superres_mode
+        SCALE_NUMERATOR,  // rc_superres_denominator
+        SCALE_NUMERATOR,  // rc_superres_kf_denominator
+        63,               // rc_superres_qthresh
+        32,               // rc_superres_kf_qthresh
+
+        AOM_CBR,      // rc_end_usage
+        { NULL, 0 },  // rc_twopass_stats_in
+        { NULL, 0 },  // rc_firstpass_mb_stats_in
+        256,          // rc_target_bandwidth
+        0,            // rc_min_quantizer
+        63,           // rc_max_quantizer
+        25,           // rc_undershoot_pct
+        25,           // rc_overshoot_pct
+
+        6000,  // rc_max_buffer_size
+        4000,  // rc_buffer_initial_size
+        5000,  // rc_buffer_optimal_size
+
+        50,    // rc_two_pass_vbrbias
+        0,     // rc_two_pass_vbrmin_section
+        2000,  // rc_two_pass_vbrmax_section
+
+        // keyframing settings (kf)
+        0,            // fwd_kf_enabled
+        AOM_KF_AUTO,  // g_kfmode
+        0,            // kf_min_dist
+        9999,         // kf_max_dist
+        0,            // sframe_dist
+        1,            // sframe_mode
+        0,            // large_scale_tile
+        0,            // monochrome
+        0,            // full_still_picture_hdr
+        0,            // save_as_annexb
+        0,            // tile_width_count
+        0,            // tile_height_count
+        { 0 },        // tile_widths
+        { 0 },        // tile_heights
+        { 1 },        // config file
+    } },
 };
 
 #ifndef VERSION_STRING
@@ -2181,7 +2249,7 @@
   },
   {
       // NOLINT
-      1,                           // 1 cfg map
+      2,                           // 2 cfg map
       encoder_usage_cfg_map,       // aom_codec_enc_cfg_map_t
       encoder_encode,              // aom_codec_encode_fn_t
       encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 94bd4f0..9774678 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -60,6 +60,7 @@
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
 
 static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                               ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
@@ -74,7 +75,7 @@
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
-static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
@@ -221,18 +222,6 @@
     return BLOCK_8X8;
 }
 
-// Lighter version of set_offsets that only sets the mode info
-// pointers.
-static void set_mode_info_offsets(const AV1_COMP *const cpi,
-                                  MACROBLOCK *const x, MACROBLOCKD *const xd,
-                                  int mi_row, int mi_col) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int idx_str = xd->mi_stride * mi_row + mi_col;
-  xd->mi = cm->mi_grid_visible + idx_str;
-  xd->mi[0] = cm->mi + idx_str;
-  x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
-}
-
 static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
                                            const TileInfo *const tile,
                                            MACROBLOCK *const x, int mi_row,
@@ -524,11 +513,11 @@
   return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd);
 }
 
-static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
-                             MACROBLOCK *const x, int mi_row, int mi_col,
-                             RD_STATS *rd_cost, PARTITION_TYPE partition,
-                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                             int64_t best_rd) {
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                          MACROBLOCK *const x, int mi_row, int mi_col,
+                          RD_STATS *rd_cost, PARTITION_TYPE partition,
+                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                          int64_t best_rd, int use_nonrd_pick_mode) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
@@ -657,8 +646,15 @@
       ctx->seg_feat = 1;
 #endif
     } else {
-      av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
-                                bsize, ctx, best_rd);
+      // TODO(kyslov): do the same for pick_intra_mode and
+      //               pick_inter_mode_sb_seg_skip
+      if (use_nonrd_pick_mode) {
+        av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                                     bsize, ctx, best_rd);
+      } else {
+        av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                                  bsize, ctx, best_rd);
+      }
 #if CONFIG_ONE_PASS_SVM
       ctx->seg_feat = 0;
 #endif
@@ -1627,25 +1623,6 @@
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-// Check to see if the given partition size is allowed for a specified number
-// of mi block rows and columns remaining in the image.
-// If not then return the largest allowed partition size
-static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
-                                      int cols_left, int *bh, int *bw) {
-  if (rows_left <= 0 || cols_left <= 0) {
-    return AOMMIN(bsize, BLOCK_8X8);
-  } else {
-    for (; bsize > 0; bsize -= 3) {
-      *bh = mi_size_high[bsize];
-      *bw = mi_size_wide[bsize];
-      if ((*bh <= rows_left) && (*bw <= cols_left)) {
-        break;
-      }
-    }
-  }
-  return bsize;
-}
-
 static void set_partial_sb_partition(const AV1_COMMON *const cm,
                                      MB_MODE_INFO *mi, int bh_in, int bw_in,
                                      int mi_rows_remaining,
@@ -1769,8 +1746,8 @@
     if (partition != PARTITION_NONE && !splits_below &&
         mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
-                       PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
 
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
@@ -1798,13 +1775,13 @@
   }
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
       break;
     case PARTITION_HORZ:
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                       INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+                    0);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + hbs < cm->mi_rows) {
         RD_STATS tmp_rdc;
@@ -1813,9 +1790,9 @@
         update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
                           mi_col, subsize, NULL);
-        rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
-                         PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                         INT64_MAX);
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                      INT64_MAX, 0);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1826,9 +1803,9 @@
       }
       break;
     case PARTITION_VERT:
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       PARTITION_VERT, subsize, &pc_tree->vertical[0],
-                       INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+                    0);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + hbs < cm->mi_cols) {
         RD_STATS tmp_rdc;
@@ -1837,9 +1814,9 @@
         update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
         encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
                           mi_col, subsize, NULL);
-        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
-                         PARTITION_VERT, subsize,
-                         &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+                      PARTITION_VERT, subsize,
+                      &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 0);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           av1_invalid_rd_stats(&last_part_rdc);
           break;
@@ -1913,9 +1890,9 @@
 
       save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                       &tmp_rdc, PARTITION_SPLIT, split_subsize,
-                       &pc_tree->split[i]->none, INT64_MAX);
+      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                    PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none,
+                    INT64_MAX, 0);
 
       restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1976,6 +1953,186 @@
   *dist = chosen_rdc.dist;
 }
 
+// TODO(kyslov): this is now very similar to rd_use_partition (except that it
+// does not do the extra search around the suggested partitioning)
+//               consider passing a flag to select the non-rd path (similar to
+//               encode_sb_row)
+static void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                                TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                                TOKENEXTRA **tp, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                                int do_recon, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  int i;
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS last_part_rdc;
+  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_invalid_rd_stats(&last_part_rdc);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+  }
+
+  for (int b = 0; b < 2; ++b) {
+    pc_tree->horizontal[b].skip_ref_frame_mask = 0;
+    pc_tree->vertical[b].skip_ref_frame_mask = 0;
+  }
+  for (int b = 0; b < 3; ++b) {
+    pc_tree->horizontala[b].skip_ref_frame_mask = 0;
+    pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
+    pc_tree->verticala[b].skip_ref_frame_mask = 0;
+    pc_tree->verticalb[b].skip_ref_frame_mask = 0;
+  }
+  for (int b = 0; b < 4; ++b) {
+    pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
+    pc_tree->vertical4[b].skip_ref_frame_mask = 0;
+  }
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_NONE, bsize, ctx_none, INT64_MAX, 1);
+      break;
+    case PARTITION_HORZ:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+                    1);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_row + hbs < cm->mi_rows) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+        av1_init_rd_stats(&tmp_rdc);
+        update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col, subsize, NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                      INT64_MAX, 1);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_VERT:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+                    1);
+      if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+          mi_col + hbs < cm->mi_cols) {
+        RD_STATS tmp_rdc;
+        const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
+        av1_init_rd_stats(&tmp_rdc);
+        update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+                          mi_col, subsize, NULL);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+                      PARTITION_VERT, subsize,
+                      &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 1);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_SPLIT:
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
+        int jj = i >> 1, ii = i & 0x01;
+        RD_STATS tmp_rdc;
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        av1_init_rd_stats(&tmp_rdc);
+        nonrd_use_partition(
+            cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+            mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+            &tmp_rdc.dist, i != 3, pc_tree->split[i]);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += x->partition_cost[pl][partition];
+    last_part_rdc.rdcost =
+        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+  }
+
+  restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
+  if (bsize == cm->seq_params.sb_size)
+    assert(last_part_rdc.rate < INT_MAX && last_part_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      x->cb_offset = 0;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  *rate = last_part_rdc.rate;
+  *dist = last_part_rdc.dist;
+}
+
 /* clang-format off */
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
                             BLOCK_4X4,    //                     4x4
@@ -2405,9 +2562,9 @@
                                        ? INT64_MAX
                                        : (best_rdc->rdcost - sum_rdc->rdcost);
 
-  rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
-                   RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
-                   rdcost_remaining);
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
+                RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
+                rdcost_remaining, 0);
 
   if (this_rdc->rate == INT_MAX) {
     sum_rdc->rdcost = INT64_MAX;
@@ -2623,8 +2780,8 @@
     const int64_t best_remain_rdcost =
         best_rdc.rdcost == INT64_MAX ? INT64_MAX
                                      : (best_rdc.rdcost - partition_rd_cost);
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+                  bsize, ctx_none, best_remain_rdcost, 0);
 
     pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
     pc_tree->pc_tree_stats.skip = ctx_none->skip;
@@ -4103,8 +4260,8 @@
       partition_attempts[PARTITION_NONE] += 1;
     }
 #endif
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+                  bsize, ctx_none, best_remain_rdcost, 0);
     pb_source_variance = x->source_variance;
     if (none_rd) *none_rd = this_rdc.rdcost;
     cur_none_rd = this_rdc.rdcost;
@@ -4433,9 +4590,8 @@
       partition_attempts[PARTITION_HORZ] += 1;
     }
 #endif
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
-                     best_remain_rdcost);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
+                  subsize, &pc_tree->horizontal[0], best_remain_rdcost, 0);
 
     if (this_rdc.rate == INT_MAX) {
       sum_rdc.rdcost = INT64_MAX;
@@ -4465,9 +4621,9 @@
         pc_tree->horizontal[1].pred_interp_filter =
             av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
       }
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
-                       PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
-                       best_rdc.rdcost - sum_rdc.rdcost);
+      pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                    best_rdc.rdcost - sum_rdc.rdcost, 0);
       horz_rd[1] = this_rdc.rdcost;
 
       if (this_rdc.rate == INT_MAX) {
@@ -4515,9 +4671,8 @@
       partition_attempts[PARTITION_VERT] += 1;
     }
 #endif
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
-                     PARTITION_VERT, subsize, &pc_tree->vertical[0],
-                     best_remain_rdcost);
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
+                  subsize, &pc_tree->vertical[0], best_remain_rdcost, 0);
 
     if (this_rdc.rate == INT_MAX) {
       sum_rdc.rdcost = INT64_MAX;
@@ -4546,9 +4701,9 @@
         pc_tree->vertical[1].pred_interp_filter =
             av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
       }
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-                       PARTITION_VERT, subsize, &pc_tree->vertical[1],
-                       best_rdc.rdcost - sum_rdc.rdcost);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[1],
+                    best_rdc.rdcost - sum_rdc.rdcost, 0);
       vert_rd[1] = this_rdc.rdcost;
 
       if (this_rdc.rate == INT_MAX) {
@@ -5531,9 +5686,8 @@
               CFL_ALPHABET_SIZE);
 }
 
-static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
-                             TileDataEnc *tile_data, int mi_row,
-                             TOKENEXTRA **tp) {
+static void encode_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                          int mi_row, TOKENEXTRA **tp, int use_nonrd_mode) {
   AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   const TileInfo *const tile_info = &tile_data->tile_info;
@@ -5642,6 +5796,13 @@
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
                        &dummy_rate, &dummy_dist, 1, pc_root);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+               use_nonrd_mode) {
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+      av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col);
+      nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+                          &dummy_rate, &dummy_dist, 1, pc_root);
+
     } else {
       const int orig_rdmult = cpi->rd.RDMULT;
       x->cb_rdmult = orig_rdmult;
@@ -5812,7 +5973,7 @@
                 cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
 
-  encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+  encode_sb_row(cpi, td, this_tile, mi_row, &tok, cpi->sf.use_nonrd_pick_mode);
 
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
   cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 10ca859..57cce12 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -71,6 +71,7 @@
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
 
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
@@ -4729,6 +4730,9 @@
     }
     av1_set_quantizer(cm, q);
     av1_init_quantizer(cpi);
+
+    av1_set_variance_partition_thresholds(cpi, q, 0);
+
     // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n",
     //        cm->current_frame.frame_number, cm->show_frame, q,
     //        cm->current_frame.frame_type, cm->superres_scale_denominator);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 7b0af8f..9942280 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -24,6 +24,8 @@
 #include "av1/common/onyxc_int.h"
 #include "av1/common/resize.h"
 #include "av1/common/timing.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/context_tree.h"
@@ -36,6 +38,7 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/tokenize.h"
+#include "av1/encoder/block.h"
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -85,7 +88,10 @@
 enum {
   // Good Quality Fast Encoding. The encoder balances quality with the amount of
   // time it takes to encode the output. Speed setting controls how fast.
-  GOOD
+  GOOD,
+  // Realtime Fast Encoding. Will force some restrictions on bitrate
+  // constraints.
+  REALTIME
 } UENUM1BYTE(MODE);
 
 enum {
@@ -132,6 +138,16 @@
   SUPERRES_MODES
 } UENUM1BYTE(SUPERRES_MODE);
 
+typedef enum {
+  kInvalid = 0,
+  kLowSadLowSumdiff = 1,
+  kLowSadHighSumdiff = 2,
+  kHighSadLowSumdiff = 3,
+  kHighSadHighSumdiff = 4,
+  kLowVarHighSumdiff = 5,
+  kVeryHighSad = 6,
+} CONTENT_STATE_SB;
+
 typedef struct TplDepStats {
   int64_t intra_cost;
   int64_t inter_cost;
@@ -825,6 +841,16 @@
   // VARIANCE_AQ segment map refresh
   int vaq_refresh;
 
+  // VAR_BASED_PARTITION thresholds
+  // 0 - threshold_128x128; 1 - threshold_64x64;
+  // 2 - threshold_32x32; 3 - threshold_16x16;
+  // 4 - vbp_threshold_8x8;
+  int64_t vbp_thresholds[5];
+  int64_t vbp_threshold_minmax;
+  int64_t vbp_threshold_sad;
+  int64_t vbp_threshold_copy;
+  BLOCK_SIZE vbp_bsize_min;
+
   // Multi-threading
   int num_workers;
   AVxWorker *workers;
@@ -1113,6 +1139,39 @@
                                      cm->current_frame.frame_type == KEY_FRAME);
 }
 
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(const AV1_COMP *const cpi,
+                                         MACROBLOCK *const x,
+                                         MACROBLOCKD *const xd, int mi_row,
+                                         int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int idx_str = xd->mi_stride * mi_row + mi_col;
+  xd->mi = cm->mi_grid_visible + idx_str;
+  xd->mi[0] = cm->mi + idx_str;
+  x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+                                             int cols_left, int *bh, int *bw) {
+  int int_size = (int)bsize;
+  if (rows_left <= 0 || cols_left <= 0) {
+    return AOMMIN(bsize, BLOCK_8X8);
+  } else {
+    for (; int_size > 0; int_size -= 3) {
+      *bh = mi_size_high[int_size];
+      *bw = mi_size_wide[int_size];
+      if ((*bh <= rows_left) && (*bw <= cols_left)) {
+        break;
+      }
+    }
+  }
+  return (BLOCK_SIZE)int_size;
+}
+
 // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
 // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
 // function, the memory must be freed by the caller. Both the buf member of the
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 7598d7c..cc7ad97 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -11107,6 +11107,121 @@
   }
   x->comp_rd_stats_idx = 0;
 }
+// TODO(kyslov): this is currently very similar to set_params_rd_pick_inter_mode
+// (except that it doesn't set ALTREF parameters).
+//               Consider passing a flag to select the non-rd path (similar to
+//               encode_sb_row).
+static void set_params_nonrd_pick_inter_mode(
+    const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+    BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+    int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
+    unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  unsigned char segment_id = mbmi->segment_id;
+  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                   MAX_SB_SIZE >> 1 };
+  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                    MAX_SB_SIZE >> 1 };
+  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+  for (int i = 0; i < MB_MODE_COUNT; ++i)
+    for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+    args->above_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->above_pred_buf[2] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
+    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+    args->left_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+    args->left_pred_buf[2] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+  } else {
+    args->above_pred_buf[0] = x->above_pred_buf;
+    args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+    args->left_pred_buf[0] = x->left_pred_buf;
+    args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+    args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
+  }
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+                           ref_costs_comp);
+
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    x->mbmi_ext->mode_context[ref_frame] = 0;
+    x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+    mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+    if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+      if (mbmi->partition != PARTITION_NONE &&
+          mbmi->partition != PARTITION_SPLIT) {
+        if (skip_ref_frame_mask & (1 << ref_frame)) {
+          int skip = 1;
+          for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+            if (!(skip_ref_frame_mask & (1 << r))) {
+              const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+              if (rf[0] == ref_frame || rf[1] == ref_frame) {
+                skip = 0;
+                break;
+              }
+            }
+          }
+          if (skip) continue;
+        }
+      }
+      assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
+      setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                                 yv12_mb);
+    }
+  }
+
+  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+
+  if (check_num_overlappable_neighbors(mbmi) &&
+      is_motion_variation_allowed_bsize(bsize)) {
+    av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+                                        args->above_pred_buf, dst_width1,
+                                        dst_height1, args->above_pred_stride);
+    av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+                                       args->left_pred_buf, dst_width2,
+                                       dst_height2, args->left_pred_stride);
+    av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col,
+                         0, num_planes);
+    calc_target_weighted_pred(
+        cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
+        args->above_pred_stride[0], args->left_pred_buf[0],
+        args->left_pred_stride[0]);
+  }
+
+  init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
+
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
+
+  if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+    x->use_default_inter_tx_type = 1;
+  else
+    x->use_default_inter_tx_type = 0;
+  if (cpi->sf.skip_repeat_interpolation_filter_search) {
+    x->interp_filter_stats_idx[0] = 0;
+    x->interp_filter_stats_idx[1] = 0;
+  }
+}
 
 static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
                                 int mi_col, RD_STATS *rd_cost,
@@ -12582,7 +12697,6 @@
 #endif
     }
   }
-
   // In effect only when speed >= 2.
   sf_refine_fast_tx_type_search(
       cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index,
@@ -12594,7 +12708,6 @@
     search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
                         ref_costs_single, &search_state);
   }
-
   search_state.best_mbmode.skip_mode = 0;
   if (cm->current_frame.skip_mode_info.skip_mode_flag &&
       !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
@@ -12672,6 +12785,535 @@
   }
 }
 
+// TODO(kyslov): now this is very similar to av1_rd_pick_inter_mode_sb except:
+//                 it only checks non-compound mode and
+//                 it doesn't check palette mode
+//                 it doesn't refine tx search
+//               this function is likely to be heavily modified with nonrd mode
+//               decision
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+                                  MACROBLOCK *x, int mi_row, int mi_col,
+                                  RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
+  PREDICTION_MODE this_mode;
+  unsigned char segment_id = mbmi->segment_id;
+  int i;
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+  mode_skip_mask_t mode_skip_mask;
+  uint8_t motion_mode_skip_mask = 0;  // second pass of single ref modes
+#if CONFIG_ONE_PASS_SVM
+  int temp_y_eob = 0, temp_y_eob_0 = 0, temp_y_eob_1 = 0, temp_y_eob_2 = 0,
+      temp_y_eob_3 = 0;
+  int64_t temp_y_rd = 0, temp_y_rd_0 = 0, temp_y_rd_1 = 0, temp_y_rd_2 = 0,
+          temp_y_rd_3 = 0;
+#endif
+
+  InterModeSearchState search_state;
+  init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
+                               best_rd_so_far);
+  INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+  };
+  HandleInterModeArgs args = {
+    { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+    { NULL },  { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
+    NULL,      NULL,
+    NULL,      search_state.modelled_rd,
+    { { 0 } }, INT_MAX,
+    INT_MAX,   search_state.simple_rd,
+    0,         interintra_modes,
+    1,         NULL
+  };
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+  av1_invalid_rd_stats(rd_cost);
+
+  // init params, set frame modes, speed features
+  set_params_nonrd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
+                                   &mode_skip_mask, ctx->skip_ref_frame_mask,
+                                   ref_costs_single, ref_costs_comp, yv12_mb);
+
+  int64_t best_est_rd = INT64_MAX;
+  // TODO(angiebird): Turn this on when this speed feature is well tested
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  const int do_tx_search =
+      !((cpi->sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+        (cpi->sf.inter_mode_rd_model_estimation == 2 &&
+         x->source_variance < 512));
+  InterModesInfo *inter_modes_info = x->inter_modes_info;
+  inter_modes_info->num = 0;
+
+  int intra_mode_num = 0;
+  int intra_mode_idx_ls[MAX_MODES];
+  int reach_first_comp_mode = 0;
+
+  // Temporary buffers used by handle_inter_mode().
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+
+  CompoundTypeRdBuffers rd_buffers;
+  alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
+  for (int midx = 0; midx < LAST_SINGLE_REF_MODES + 1; ++midx) {
+    const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
+    this_mode = mode_order->mode;
+    const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+    const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+    const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+    if (ref_frame > LAST_FRAME) continue;
+
+    // When single ref motion search ends:
+    // 1st pass: To evaluate single ref RD results and rewind to the beginning;
+    // 2nd pass: To continue with compound ref search.
+    if (sf->prune_single_motion_modes_by_simple_trans) {
+      if (comp_pred && args.single_ref_first_pass) {
+        args.single_ref_first_pass = 0;
+        // Reach the first comp ref mode
+        // Reset midx to start the 2nd pass for single ref motion search
+        midx = -1;
+        motion_mode_skip_mask = analyze_simple_trans_states(cpi, x);
+        continue;
+      }
+      if (!comp_pred) {  // single ref mode
+        if (args.single_ref_first_pass) {
+          // clear stats
+          for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
+            x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
+            x->simple_rd_state[midx][k].early_skipped = 0;
+          }
+        } else {
+          if (motion_mode_skip_mask & (1 << ref_frame)) {
+            continue;
+          }
+        }
+      }
+    }
+
+    // Reach the first compound prediction mode
+    if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+        reach_first_comp_mode == 0) {
+      analyze_single_states(cpi, &search_state);
+      reach_first_comp_mode = 1;
+    }
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int64_t distortion2 = 0;
+    int skippable = 0;
+    int this_skip2 = 0;
+
+    init_mbmi(mbmi, midx, cm);
+
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
+
+    const int ret = inter_mode_search_order_independent_skip(
+        cpi, ctx, x, bsize, midx, mi_row, mi_col, &mode_skip_mask,
+        &search_state);
+    if (ret == 1) continue;
+    args.skip_motion_mode = (ret == 2);
+
+    if (sf->drop_ref && comp_pred) {
+      if (sf_check_is_drop_ref(mode_order, &search_state)) {
+        continue;
+      }
+    }
+
+    if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
+
+    if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+      if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+                                         ref_frame, second_ref_frame, x))
+        continue;
+    }
+
+    const int ref_frame_cost = comp_pred
+                                   ? ref_costs_comp[ref_frame][second_ref_frame]
+                                   : ref_costs_single[ref_frame];
+    const int compmode_cost =
+        is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+    const int real_compmode_cost =
+        cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
+            ? compmode_cost
+            : 0;
+
+    if (comp_pred) {
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          search_state.best_mode_index >= 0 &&
+          search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
+        continue;
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      if (!cpi->oxcf.enable_smooth_intra &&
+          (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+           mbmi->mode == SMOOTH_V_PRED))
+        continue;
+      if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+      if (sf->adaptive_mode_search > 1)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+            search_state.best_pred_sse)
+          continue;
+
+      if (this_mode != DC_PRED) {
+        // Only search the oblique modes if the best so far is
+        // one of the neighboring directional modes
+        if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+          if (search_state.best_mode_index >= 0 &&
+              search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
+            continue;
+        }
+        if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, search_state.best_intra_mode))
+            continue;
+        }
+      }
+    }
+
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      intra_mode_idx_ls[intra_mode_num++] = midx;
+      continue;
+    } else {
+      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
+      mbmi->ref_mv_idx = 0;
+      int64_t ref_best_rd = search_state.best_rd;
+      {
+        RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+        av1_init_rd_stats(&rd_stats);
+        rd_stats.rate = rate2;
+
+        // Point to variables that are maintained between loop iterations
+        args.single_newmv = search_state.single_newmv;
+        args.single_newmv_rate = search_state.single_newmv_rate;
+        args.single_newmv_valid = search_state.single_newmv_valid;
+        args.single_comp_cost = real_compmode_cost;
+        args.ref_frame_cost = ref_frame_cost;
+        if (midx < MAX_SINGLE_REF_MODES) {
+          args.simple_rd_state = x->simple_rd_state[midx];
+        }
+        this_rd = handle_inter_mode(
+            cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+            &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
+            &rd_buffers, &best_est_rd, do_tx_search, inter_modes_info);
+        rate2 = rd_stats.rate;
+        skippable = rd_stats.skip;
+        distortion2 = rd_stats.dist;
+        rate_y = rd_stats_y.rate;
+        rate_uv = rd_stats_uv.rate;
+#if CONFIG_ONE_PASS_SVM
+        av1_unpack_reg_stat(&rd_stats_y, &temp_y_eob, &temp_y_eob_0,
+                            &temp_y_eob_1, &temp_y_eob_2, &temp_y_eob_3,
+                            &temp_y_rd, &temp_y_rd_0, &temp_y_rd_1,
+                            &temp_y_rd_2, &temp_y_rd_3);
+#endif
+      }
+
+      if (sf->prune_comp_search_by_single_result > 0 &&
+          is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
+        collect_single_states(x, &search_state, mbmi);
+      }
+
+      if (this_rd == INT64_MAX) continue;
+
+      this_skip2 = mbmi->skip;
+      this_rd = RDCOST(x->rdmult, rate2, distortion2);
+      if (this_skip2) {
+        rate_y = 0;
+        rate_uv = 0;
+      }
+    }
+
+    // Did this mode help, i.e. is it the new best mode?
+    if (this_rd < search_state.best_rd || x->skip) {
+      int mode_excluded = 0;
+      if (comp_pred) {
+        mode_excluded = cm->current_frame.reference_mode == SINGLE_REFERENCE;
+      }
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        search_state.best_mode_index = midx;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mbmi->mv[0].as_int = 0;
+        } else {
+          search_state.best_pred_sse = x->pred_sse[ref_frame];
+        }
+
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
+        search_state.best_rd = this_rd;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = this_skip2;
+        search_state.best_mode_skippable = skippable;
+        if (do_tx_search) {
+          // When do_tx_search == 0, handle_inter_mode won't provide correct
+          // rate_y and rate_uv because the txfm_search process is replaced by
+          // rd estimation.
+          // Therefore, we should avoid updating best_rate_y and best_rate_uv
+          // here. These two values will be updated when txfm_search is called.
+          search_state.best_rate_y =
+              rate_y +
+              x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+          search_state.best_rate_uv = rate_uv;
+
+#if CONFIG_ONE_PASS_SVM
+          av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1,
+                           temp_y_eob_2, temp_y_eob_3, temp_y_rd, temp_y_rd_0,
+                           temp_y_rd_1, temp_y_rd_2, temp_y_rd_3);
+#endif
+        }
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      }
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
+
+      if (!comp_pred) {
+        if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
+          search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      } else {
+        if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
+          search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+      }
+      if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+        search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+    }
+    if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+      // Collect data from single ref mode, and analyze data.
+      sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+    }
+
+    if (x->skip && !comp_pred) break;
+  }
+
+  release_compound_type_rd_buffers(&rd_buffers);
+
+  if (!do_tx_search) {
+    inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+    search_state.best_rd = INT64_MAX;
+
+    int64_t top_est_rd =
+        inter_modes_info->num > 0
+            ? inter_modes_info
+                  ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+            : INT64_MAX;
+    for (int j = 0; j < inter_modes_info->num; ++j) {
+      const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+      *mbmi = inter_modes_info->mbmi_arr[data_idx];
+      int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+      if (curr_est_rd * 0.80 > top_est_rd) break;
+
+      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+      x->skip = 0;
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+      // Select prediction reference frames.
+      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+      for (i = 0; i < num_planes; i++) {
+        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+      }
+
+      RD_STATS rd_stats;
+      RD_STATS rd_stats_y;
+      RD_STATS rd_stats_uv;
+
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                    av1_num_planes(cm) - 1);
+      if (mbmi->motion_mode == OBMC_CAUSAL)
+        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+      if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
+                       &rd_stats_y, &rd_stats_uv, mode_rate,
+                       search_state.best_rd)) {
+        continue;
+      } else if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+        const int skip_ctx = av1_get_skip_context(xd);
+        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+                             rd_stats.dist,
+                             rd_stats_y.rate + rd_stats_uv.rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip]);
+      }
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+      if (rd_stats.rdcost < search_state.best_rd) {
+        search_state.best_rd = rd_stats.rdcost;
+        // Note index of best mode so far
+        const int mode_index = get_prediction_mode_idx(
+            mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        search_state.best_mode_index = mode_index;
+        *rd_cost = rd_stats;
+        search_state.best_rd = rd_stats.rdcost;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = mbmi->skip;
+        search_state.best_mode_skippable = rd_stats.skip;
+        search_state.best_rate_y =
+            rd_stats_y.rate +
+            x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+        search_state.best_rate_uv = rd_stats_uv.rate;
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+#if CONFIG_ONE_PASS_SVM
+        av1_copy_reg_stat(rd_cost, &rd_stats_y);
+#endif
+      }
+    }
+  }
+
+  for (int j = 0; j < intra_mode_num; ++j) {
+    const int mode_index = intra_mode_idx_ls[j];
+    const MV_REFERENCE_FRAME ref_frame =
+        av1_mode_order[mode_index].ref_frame[0];
+    assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+    assert(ref_frame == INTRA_FRAME);
+    if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+    init_mbmi(mbmi, mode_index, cm);
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+    }
+
+    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+    const int ref_frame_cost = ref_costs_single[ref_frame];
+    intra_rd_stats.rdcost = handle_intra_mode(
+        &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+        &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+    if (intra_rd_stats.rdcost < search_state.best_rd) {
+      search_state.best_rd = intra_rd_stats.rdcost;
+      // Note index of best mode so far
+      search_state.best_mode_index = mode_index;
+      *rd_cost = intra_rd_stats;
+      search_state.best_rd = intra_rd_stats.rdcost;
+      search_state.best_mbmode = *mbmi;
+      search_state.best_skip2 = 0;
+      search_state.best_mode_skippable = intra_rd_stats.skip;
+      search_state.best_rate_y =
+          intra_rd_stats_y.rate +
+          x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+      search_state.best_rate_uv = intra_rd_stats_uv.rate;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+#if CONFIG_ONE_PASS_SVM
+      av1_copy_reg_stat(rd_cost, &intra_rd_stats_y);
+#endif
+    }
+  }
+
+  search_state.best_mbmode.skip_mode = 0;
+  if (cm->current_frame.skip_mode_info.skip_mode_flag &&
+      !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+      is_comp_ref_allowed(bsize)) {
+    rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
+                      yv12_mb);
+  }
+
+  // Make sure that the ref_mv_idx is only nonzero when we're
+  // using a mode which can support ref_mv_idx
+  if (search_state.best_mbmode.ref_mv_idx != 0 &&
+      !(search_state.best_mbmode.mode == NEWMV ||
+        search_state.best_mbmode.mode == NEW_NEWMV ||
+        have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+    search_state.best_mbmode.ref_mv_idx = 0;
+  }
+
+  if (search_state.best_mode_index < 0 ||
+      search_state.best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  assert(
+      (cm->interp_filter == SWITCHABLE) ||
+      (cm->interp_filter ==
+       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
+      !is_inter_block(&search_state.best_mbmode));
+  assert(
+      (cm->interp_filter == SWITCHABLE) ||
+      (cm->interp_filter ==
+       av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
+      !is_inter_block(&search_state.best_mbmode));
+
+  if (!cpi->rc.is_src_frame_alt_ref)
+    av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                              sf->adaptive_rd_thresh, bsize,
+                              search_state.best_mode_index);
+
+  // macroblock modes
+  *mbmi = search_state.best_mbmode;
+  x->skip |= search_state.best_skip2;
+
+  // Note: this section is needed since the mode may have been forced to
+  // GLOBALMV by the all-zero mode handling of ref-mv.
+  if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+    // Correct the interp filters for GLOBALMV
+    if (is_nontrans_global_motion(xd, xd->mi[0])) {
+      assert(mbmi->interp_filters ==
+             av1_broadcast_interp_filter(
+                 av1_unswitchable_filter(cm->interp_filter)));
+    }
+  }
+
+  for (i = 0; i < REFERENCE_MODES; ++i) {
+    if (search_state.best_pred_rd[i] == INT64_MAX)
+      search_state.best_pred_diff[i] = INT_MIN;
+    else
+      search_state.best_pred_diff[i] =
+          search_state.best_rd - search_state.best_pred_rd[i];
+  }
+
+  x->skip |= search_state.best_mode_skippable;
+
+  assert(search_state.best_mode_index >= 0);
+
+  store_coding_context(x, ctx, search_state.best_mode_index,
+                       search_state.best_pred_diff,
+                       search_state.best_mode_skippable);
+}
+
 void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
                                         TileDataEnc *tile_data, MACROBLOCK *x,
                                         int mi_row, int mi_col,
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 0ce931a..7ba1b18 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -123,6 +123,13 @@
                                struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
 
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                                  struct TileDataEnc *tile_data,
+                                  struct macroblock *x, int mi_row, int mi_col,
+                                  struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far);
+
 void av1_rd_pick_inter_mode_sb_seg_skip(
     const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
     struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index bbb6842..5237b03 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -237,6 +237,7 @@
   sf->disable_wedge_search_edge_thresh = 0;
   sf->prune_motion_mode_level = 1;
   sf->cb_pred_filter_search = 0;
+  sf->use_nonrd_pick_mode = 0;
 
   if (speed >= 1) {
     sf->gm_erroradv_type = GM_ERRORADV_TR_1;
@@ -407,6 +408,211 @@
   }
 }
 
+// TODO(kyslov): now this is very similar to
+// set_good_speed_features_framesize_independent
+//               except it sets the non-RD flag at speed 8. This function
+//               will likely be modified in the future with RT-specific speed features
+static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
+                                                        SPEED_FEATURES *sf,
+                                                        int speed) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int boosted = frame_is_boosted(cpi);
+
+  // Speed 0 for all speed features that give neutral coding performance change.
+  sf->reduce_inter_modes = 1;
+  sf->prune_ext_partition_types_search_level = 1;
+  sf->ml_prune_rect_partition = 1;
+  sf->ml_prune_ab_partition = 1;
+  sf->ml_prune_4_partition = 1;
+  sf->adaptive_txb_search_level = 1;
+  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+  sf->model_based_prune_tx_search_level = 1;
+  sf->model_based_post_interp_filter_breakout = 1;
+  sf->model_based_motion_mode_rd_breakout = 1;
+
+  // TODO(debargha): Test, tweak and turn on either 1 or 2
+  sf->inter_mode_rd_model_estimation = 0;
+
+  sf->prune_ref_frame_for_rect_partitions =
+      !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+  sf->prune_ref_mode_for_partitions = sf->prune_ref_frame_for_rect_partitions;
+  sf->less_rectangular_check_level = 1;
+  sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
+  sf->gm_disable_recode = 1;
+  sf->use_fast_interpolation_filter_search = 1;
+  sf->intra_tx_size_search_init_depth_sqr = 1;
+  sf->intra_angle_estimation = 1;
+  sf->selective_ref_frame = 1;
+  sf->prune_wedge_pred_diff_based = 1;
+  sf->disable_wedge_search_var_thresh = 0;
+  sf->disable_wedge_search_edge_thresh = 0;
+  sf->prune_motion_mode_level = 1;
+  sf->cb_pred_filter_search = 0;
+  sf->use_nonrd_pick_mode = 0;  // non-RD mode decision off; enabled at speed >= 8 below
+
+  if (speed >= 1) {
+    sf->gm_erroradv_type = GM_ERRORADV_TR_1;
+    sf->selective_ref_frame = 2;
+
+    sf->inter_tx_size_search_init_depth_rect = 1;
+    sf->inter_tx_size_search_init_depth_sqr = 1;
+    sf->intra_tx_size_search_init_depth_rect = 1;
+    sf->tx_size_search_lgr_block = 1;
+    sf->prune_ext_partition_types_search_level = 2;
+    sf->skip_repeat_interpolation_filter_search = 1;
+    sf->tx_type_search.skip_tx_search = 1;
+    sf->tx_type_search.ml_tx_split_thresh = 40;
+    sf->model_based_prune_tx_search_level = 0;
+    sf->adaptive_txb_search_level = 2;
+    sf->use_intra_txb_hash = 1;
+    sf->optimize_b_precheck = 1;
+    sf->dual_sgr_penalty_level = 1;
+    sf->use_accurate_subpel_search = USE_4_TAPS;
+    sf->reuse_inter_intra_mode = 1;
+    sf->prune_comp_search_by_single_result = 1;
+    sf->skip_repeated_newmv = 1;
+    sf->obmc_full_pixel_search_level = 1;
+    // TODO(anyone): Following speed feature will be further explored to
+    // identify the appropriate tradeoff between encoder performance and its
+    // speed.
+    sf->prune_single_motion_modes_by_simple_trans = 1;
+
+    sf->simple_motion_search_prune_rect = 1;
+
+    sf->disable_wedge_search_var_thresh = 0;
+    sf->disable_wedge_search_edge_thresh = 0;
+    sf->prune_comp_type_by_comp_avg = 1;
+    sf->prune_motion_mode_level = 2;
+    sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
+    sf->cb_pred_filter_search = 1;
+    sf->use_transform_domain_distortion = boosted ? 0 : 1;
+  }
+
+  if (speed >= 2) {
+    sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+
+    sf->selective_ref_frame = 3;
+    sf->fast_cdef_search = 1;
+
+    sf->adaptive_rd_thresh = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+
+    sf->partition_search_breakout_rate_thr = 80;
+    // Note: This speed feature is disabled as it seems to be worse in
+    // compression/quality and is also slower.
+    // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->allow_partition_search_skip = 1;
+    sf->disable_wedge_search_var_thresh = 100;
+    sf->disable_wedge_search_edge_thresh = 0;
+    sf->fast_wedge_sign_estimate = 1;
+    sf->disable_dual_filter = 1;
+    sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+    sf->prune_comp_type_by_comp_avg = 2;
+    sf->cb_pred_filter_search = 0;
+    sf->adaptive_interp_filter_search = 1;
+  }
+
+  if (speed >= 3) {
+    sf->selective_ref_frame = 4;
+    sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+    sf->less_rectangular_check_level = 2;
+    sf->adaptive_pred_interp_filter = 1;
+    // adaptive_motion_search breaks encoder multi-thread tests.
+    // The values in x->pred_mv[] differ for single and multi-thread cases.
+    // See aomedia:1778.
+    // sf->adaptive_motion_search = 1;
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->use_transform_domain_distortion = 1;
+    sf->use_accurate_subpel_search = USE_2_TAPS;
+    sf->adaptive_rd_thresh = 2;
+    sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
+    sf->gm_search_type = GM_DISABLE_SEARCH;
+    sf->prune_comp_search_by_single_result = 2;
+    sf->prune_motion_mode_level = boosted ? 2 : 3;
+    sf->prune_warp_using_wmtype = 1;
+    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
+    // it with cpi->sf.disable_wedge_search_var_thresh.
+    sf->disable_wedge_interintra_search = 1;
+  }
+
+  if (speed >= 4) {
+    sf->use_intra_txb_hash = 0;
+    sf->use_mb_rd_hash = 0;
+    sf->tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_type_search.fast_inter_tx_type_search = 1;
+    sf->use_square_partition_only_threshold =
+        boosted ? BLOCK_128X128 : BLOCK_4X4;
+    sf->tx_size_search_method =
+        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+    sf->adaptive_pred_interp_filter = 0;
+    sf->adaptive_mode_search = 1;
+    sf->cb_partition_search = !boosted;
+    sf->alt_ref_search_fp = 1;
+    sf->skip_sharp_interp_filter_search = 1;
+  }
+
+  if (speed >= 5) {
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+    sf->use_square_partition_only_threshold = BLOCK_4X4;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->mv.search_method = BIGDIA;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_search_skip_flags =
+        (cm->current_frame.frame_type == KEY_FRAME)
+            ? 0
+            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+                  FLAG_EARLY_TERMINATE;
+    sf->disable_filter_search_var_thresh = 200;
+    sf->use_fast_coef_costing = 1;
+    sf->partition_search_breakout_rate_thr = 300;
+    sf->use_transform_domain_distortion = 2;
+  }
+
+  if (speed >= 6) {
+    int i;
+    sf->optimize_coefficients = NO_TRELLIS_OPT;
+    sf->mv.search_method = HEX;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; ++i) {
+      sf->intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+    }
+    sf->partition_search_breakout_rate_thr = 500;
+    sf->mv.reduce_first_step_size = 1;
+    sf->simple_model_rd_from_var = 1;
+  }
+  if (speed >= 7) {
+    sf->default_max_partition_size = BLOCK_32X32;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->frame_parameter_update = 0;
+    sf->mv.search_method = FAST_HEX;
+    sf->partition_search_type = REFERENCE_PARTITION;
+    sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+  }
+  if (speed >= 8) {
+    sf->mv.search_method = FAST_DIAMOND;
+    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+    sf->default_max_partition_size = BLOCK_128X128;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->partition_search_type = VAR_BASED_PARTITION;
+    sf->use_nonrd_pick_mode = 1;  // RT path: switch to non-RD mode decision
+  }
+}
+
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
@@ -570,12 +776,13 @@
   sf->disable_interinter_wedge_newmv_search = 0;
   sf->prune_motion_mode_level = 0;
   sf->prune_warp_using_wmtype = 0;
-
   sf->disable_wedge_interintra_search = 0;
   sf->perform_coeff_opt = 0;
 
   if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, speed);
+  else if (oxcf->mode == REALTIME)
+    set_rt_speed_features_framesize_independent(cpi, sf, speed);
 
   if (!cpi->seq_params_locked) {
     cpi->common.seq_params.enable_dual_filter &= !sf->disable_dual_filter;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 2216c7b..b055904 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -191,7 +191,9 @@
   // Always use a fixed size partition
   FIXED_PARTITION,
 
-  REFERENCE_PARTITION
+  REFERENCE_PARTITION,
+
+  VAR_BASED_PARTITION
 } UENUM1BYTE(PARTITION_SEARCH_TYPE);
 
 enum {
@@ -652,6 +654,9 @@
 
   // Flag used to control the extent of coeff R-D optimization
   int perform_coeff_opt;
+
+  // This flag controls the use of non-RD mode decision.
+  int use_nonrd_pick_mode;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
new file mode 100644
index 0000000..7f19344
--- /dev/null
+++ b/av1/encoder/var_based_part.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern const uint8_t AV1_VAR_OFFS[];
+
+typedef struct {
+  // TODO(kyslov): consider changing to 64bit
+
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+  // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
+  // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
+  uint32_t sum_square_error;
+  int32_t sum_error;
+  int log2_count;  // log2 of the number of accumulated samples
+  int variance;    // derived by get_variance() from the fields above
+} var;  // variance accumulator for one block
+
+typedef struct {
+  var none;     // whole node (no split)
+  var horz[2];  // top/bottom halves
+  var vert[2];  // left/right halves
+} partition_variance;
+
+typedef struct {
+  partition_variance part_variances;
+  var split[4];
+} v4x4;  // 4x4 node: leaves hold raw accumulators
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
+} v8x8;
+
+typedef struct {
+  partition_variance part_variances;
+  v8x8 split[4];
+} v16x16;
+
+typedef struct {
+  partition_variance part_variances;
+  v16x16 split[4];
+} v32x32;
+
+typedef struct {
+  partition_variance part_variances;
+  v32x32 split[4];
+} v64x64;
+
+typedef struct {
+  partition_variance part_variances;
+  v64x64 split[4];
+} v128x128;  // root of the variance tree for a 128x128 superblock
+
+typedef struct {
+  partition_variance *part_variances;
+  var *split[4];
+} variance_node;  // size-generic view of one tree node (see tree_to_node)
+// Points a size-generic variance_node at the part_variances and the four
+// split-child accumulators of the typed tree node for the given bsize.
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+  int i;
+  node->part_variances = NULL;
+  switch (bsize) {
+    case BLOCK_128X128: {
+      v128x128 *vt = (v128x128 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_64X64: {
+      v64x64 *vt = (v64x64 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      v32x32 *vt = (v32x32 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      v16x16 *vt = (v16x16 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      v8x8 *vt = (v8x8 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    default: {
+      v4x4 *vt = (v4x4 *)data;
+      assert(bsize == BLOCK_4X4);
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
+      break;
+    }
+  }
+}
+
+// Set variance accumulator fields given sum square error, sum error and the
+// log2 of the sample count; `variance` itself is computed later.
+static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+}
+// Computes the scaled variance (x256) from the accumulated sums.
+static void get_variance(var *v) {
+  v->variance =
+      (int)(256 * (v->sum_square_error -
+                   (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+                              v->log2_count)) >>
+            v->log2_count);  // NOTE(review): 256*(...) is uint32 arithmetic; may wrap for very large SSE — confirm intended, mirrors libvpx
+}
+// Merges two accumulators with equal sample counts into one (count doubles).
+static void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
+  fill_variance(a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+// Fills a node's horz/vert/none accumulators by merging its four children.
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+  variance_node node;
+  memset(&node, 0, sizeof(node));
+  tree_to_node(data, bsize, &node);
+  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+                  &node.part_variances->none);
+}
+// Sets sb_type at (mi_row, mi_col) if the position lies inside the frame.
+static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
+                           MACROBLOCKD *const xd, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize) {
+  if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
+    set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+    xd->mi[0]->sb_type = bsize;
+  }
+}
+// Decides the partition at this tree node from the variance thresholds.
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
+                               MACROBLOCKD *const xd, void *data,
+                               BLOCK_SIZE bsize, int mi_row, int mi_col,
+                               int64_t threshold, BLOCK_SIZE bsize_min,
+                               int force_split) {  // returns 1 if a partition was set, 0 if the caller must split
+  AV1_COMMON *const cm = &cpi->common;
+  variance_node vt;
+  const int block_width = mi_size_wide[bsize];
+  const int block_height = mi_size_high[bsize];
+
+  assert(block_height == block_width);
+  tree_to_node(data, bsize, &vt);
+
+  if (force_split == 1) return 0;  // caller already decided to split
+
+  // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_min) {
+    // Variance already computed to set the force_split.
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_min) {
+    // Variance already computed to set the force_split.
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+    // For key frame: take split for bsize above 32X32 or very high variance.
+    if (frame_is_intra_only(cm) &&
+        (bsize > BLOCK_32X32 ||
+         vt.part_variances->none.variance > (threshold << 4))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+
+    // Check vertical split.
+    if (mi_row + block_height / 2 < cm->mi_rows) {
+      BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+      get_variance(&vt.part_variances->vert[0]);
+      get_variance(&vt.part_variances->vert[1]);
+      if (vt.part_variances->vert[0].variance < threshold &&
+          vt.part_variances->vert[1].variance < threshold &&
+          get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+                               xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+        return 1;
+      }
+    }
+    // Check horizontal split.
+    if (mi_col + block_width / 2 < cm->mi_cols) {
+      BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+      get_variance(&vt.part_variances->horz[0]);
+      get_variance(&vt.part_variances->horz[1]);
+      if (vt.part_variances->horz[0].variance < threshold &&
+          vt.part_variances->horz[1].variance < threshold &&
+          get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+                               xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+        return 1;
+      }
+    }
+
+    return 0;
+  }
+  return 0;
+}
+// Fills the four 8x8-average variance accumulators of one 16x16 block from
+// source (s) and, for non-key frames, the reference/denoised buffer (d).
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
+                                 int pixels_wide, int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;  // key frame: compare against mid-gray
+      s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+// Returns the spread (max - min) of the per-8x8 abs-diff dynamic ranges
+// across the four 8x8 subblocks of a 16x16 block.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+                              int dp, int x16_idx, int y16_idx, int pixels_wide,
+                              int pixels_high) {
+  int k;
+  int minmax_max = 0;
+  int minmax_min = 255;
+  // Loop over the 4 8x8 subblocks.
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    int min = 0;
+    int max = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp,
+                     &min, &max);
+      if ((max - min) > minmax_max) minmax_max = (max - min);
+      if ((max - min) < minmax_min) minmax_min = (max - min);
+    }
+  }
+  return (minmax_max - minmax_min);
+}
+// Fills the four 4x4-average variance accumulators of one 8x8 block;
+// mirrors fill_variance_8x8avg at the finer 4x4 sampling.
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
+                                 int pixels_wide, int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x4_idx = x8_idx + ((k & 1) << 2);
+    int y4_idx = y8_idx + ((k >> 1) << 2);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;  // key frame: compare against mid-gray
+      s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+// Scales the base partition threshold up (by 5/4) for low resolutions or
+// low-sum-diff content at speeds 7 and 8; otherwise returns it unchanged.
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+                                         int width, int height,
+                                         int content_state) {
+  if (speed >= 8) {
+    if (width <= 640 && height <= 480)
+      return (5 * threshold_base) >> 2;
+    else if ((content_state == kLowSadLowSumdiff) ||
+             (content_state == kHighSadLowSumdiff) ||
+             (content_state == kLowVarHighSumdiff))
+      return (5 * threshold_base) >> 2;
+  } else if (speed == 7) {
+    if ((content_state == kLowSadLowSumdiff) ||
+        (content_state == kHighSadLowSumdiff) ||
+        (content_state == kLowVarHighSumdiff)) {
+      return (5 * threshold_base) >> 2;
+    }
+  }
+  return threshold_base;
+}
+
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
+// currently only used on key frame.
+static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q,
+                               int content_state) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int threshold_multiplier = is_key_frame ? 40 : 1;
+  int64_t threshold_base =
+      (int64_t)(threshold_multiplier * cpi->dequants.y_dequant_QTX[q][1]);
+
+  if (is_key_frame) {
+    thresholds[0] = threshold_base;
+    thresholds[1] = threshold_base;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base >> 2;
+    thresholds[4] = threshold_base << 2;
+  } else {
+    // Increase base variance threshold based on content_state/sum_diff level.
+    threshold_base = scale_part_thresh_sumdiff(
+        threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+
+    thresholds[1] = threshold_base;
+    thresholds[3] = threshold_base << cpi->oxcf.speed;
+    if (cm->width >= 1280 && cm->height >= 720)
+      thresholds[3] = thresholds[3] << 1;
+    if (cm->width <= 352 && cm->height <= 288) {
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = threshold_base >> 1;
+      thresholds[3] = threshold_base << 3;
+    } else if (cm->width < 1280 && cm->height < 720) {
+      thresholds[2] = (5 * threshold_base) >> 2;
+    } else if (cm->width < 1920 && cm->height < 1080) {
+      thresholds[2] = threshold_base << 1;
+      thresholds[3] <<= 2;
+    } else {
+      thresholds[2] = (5 * threshold_base) >> 1;
+    }
+  }
+}
+// Sets frame-level thresholds (variance, SAD, copy, minmax) used by the
+// variance-based partitioner; no-op unless VAR_BASED_PARTITION is selected.
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_state) {
+  AV1_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const int is_key_frame = frame_is_intra_only(cm);
+  if (sf->partition_search_type != VAR_BASED_PARTITION) {
+    return;
+  } else {
+    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state);
+    // The thresholds below are not changed locally.
+    if (is_key_frame) {
+      cpi->vbp_threshold_sad = 0;
+      cpi->vbp_threshold_copy = 0;
+      cpi->vbp_bsize_min = BLOCK_8X8;
+    } else {
+      if (cm->width <= 352 && cm->height <= 288)
+        cpi->vbp_threshold_sad = 10;
+      else
+        cpi->vbp_threshold_sad = (cpi->dequants.y_dequant_QTX[q][1] << 1) > 1000
+                                     ? (cpi->dequants.y_dequant_QTX[q][1] << 1)
+                                     : 1000;
+      cpi->vbp_bsize_min = BLOCK_16X16;
+      if (cm->width <= 352 && cm->height <= 288)
+        cpi->vbp_threshold_copy = 4000;
+      else if (cm->width <= 640 && cm->height <= 360)
+        cpi->vbp_threshold_copy = 8000;
+      else
+        cpi->vbp_threshold_copy =
+            (cpi->dequants.y_dequant_QTX[q][1] << 3) > 8000
+                ? (cpi->dequants.y_dequant_QTX[q][1] << 3)
+                : 8000;
+    }
+    cpi->vbp_threshold_minmax = 15 + (q >> 3);
+  }
+}
+
+// This function chooses partitioning based on the variance between the source
+// and the reconstructed last frame, where variance is computed on
+// down-sampled inputs.
+// TODO(kyslov): a lot of things. Bring back noise estimation, brush up
+// partition selection and, most of all, retune the thresholds.
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      MACROBLOCK *x, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int i, j, k, m;
+  v128x128 *vt;
+  v16x16 *vt2 = NULL;
+  unsigned char force_split[85];
+  int avg_32x32;
+  int max_var_32x32 = 0;
+  int min_var_32x32 = INT_MAX;
+  int var_32x32;
+  int var_64x64;
+  int min_var_64x64 = INT_MAX;
+  int max_var_64x64 = 0;
+  int avg_16x16[4];
+  int maxvar_16x16[4];
+  int minvar_16x16[4];
+  int64_t threshold_4x4avg;
+  int content_state = 0;
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+  int compute_minmax_variance = 1;
+  int is_key_frame = frame_is_intra_only(cm);
+  int pixels_wide = 128, pixels_high = 128;
+
+  CHECK_MEM_ERROR(cm, vt, aom_calloc(1, sizeof(*vt)));
+
+  int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+                            cpi->vbp_thresholds[2], cpi->vbp_thresholds[3],
+                            cpi->vbp_thresholds[4] };
+
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[64];
+  int segment_id;
+  const int num_planes = av1_num_planes(cm);
+
+  segment_id = xd->mi[0]->segment_id;
+
+  set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
+
+  // TODO(kyslov): 4x4 averaging is currently disabled unconditionally.
+  threshold_4x4avg = INT64_MAX;
+
+  if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+  if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // Index for force_split: 0 for 128x128, 1-4 for the 64x64 blocks,
+  // 5-20 for the 32x32 blocks, 21-84 for the 16x16 blocks.
+  force_split[0] = 0;
+
+  if (!is_key_frame) {
+    // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
+    // is!!
+    MB_MODE_INFO *mi = xd->mi[0];
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+    assert(yv12 != NULL);
+
+    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         get_ref_scale_factors(cm, LAST_FRAME), num_planes);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+    mi->sb_type = BLOCK_128X128;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_make_interp_filters(BILINEAR, BILINEAR);
+
+// TODO(kyslov): bring the small SAD functionality back
+#if 0
+    y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                   xd->plane[0].pre[0].buf,
+                                   xd->plane[0].pre[0].stride);
+#endif
+    x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, BLOCK_128X128,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
+
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take 64x64 as partition and exit.
+    // Don't check on boosted segment for now, as 64x64 is suppressed there.
+#if 0
+        if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad)
+       { const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const
+       int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64]; if (mi_col +
+       block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows)
+       { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_128X128);
+            x->variance_low[0] = 1;
+            return 0;
+          }
+        }
+#endif
+  } else {
+    d = AV1_VAR_OFFS;
+    dp = 0;
+  }
+
+  if (low_res && threshold_4x4avg < INT64_MAX)
+    CHECK_MEM_ERROR(cm, vt2, aom_calloc(64, sizeof(*vt2)));
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
+  for (m = 0; m < 4; m++) {
+    const int x64_idx = ((m & 1) << 6);
+    const int y64_idx = ((m >> 1) << 6);
+    const int m2 = m << 2;
+    force_split[m + 1] = 0;
+    for (i = 0; i < 4; i++) {
+      const int x32_idx = x64_idx + ((i & 1) << 5);
+      const int y32_idx = y64_idx + ((i >> 1) << 5);
+      const int i2 = (m2 + i) << 2;
+      force_split[5 + m2 + i] = 0;
+      avg_16x16[i] = 0;
+      maxvar_16x16[i] = 0;
+      minvar_16x16[i] = INT_MAX;
+      for (j = 0; j < 4; j++) {
+        const int x16_idx = x32_idx + ((j & 1) << 4);
+        const int y16_idx = y32_idx + ((j >> 1) << 4);
+        const int split_index = 21 + i2 + j;
+        v16x16 *vst = &vt->split[m].split[i].split[j];
+        force_split[split_index] = 0;
+        variance4x4downsample[i2 + j] = 0;
+        if (!is_key_frame) {
+          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, pixels_wide,
+                               pixels_high, is_key_frame);
+          fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16);
+          get_variance(&vt->split[m].split[i].split[j].part_variances.none);
+          avg_16x16[i] +=
+              vt->split[m].split[i].split[j].part_variances.none.variance;
+          if (vt->split[m].split[i].split[j].part_variances.none.variance <
+              minvar_16x16[i])
+            minvar_16x16[i] =
+                vt->split[m].split[i].split[j].part_variances.none.variance;
+          if (vt->split[m].split[i].split[j].part_variances.none.variance >
+              maxvar_16x16[i])
+            maxvar_16x16[i] =
+                vt->split[m].split[i].split[j].part_variances.none.variance;
+          if (vt->split[m].split[i].split[j].part_variances.none.variance >
+              thresholds[3]) {
+            // 16X16 variance is above threshold for split, so force split to
+            // 8x8 for this 16x16 block (this also forces splits for upper
+            // levels).
+            force_split[split_index] = 1;
+            force_split[5 + m2 + i] = 1;
+            force_split[m + 1] = 1;
+            force_split[0] = 1;
+          } else if (compute_minmax_variance &&
+                     vt->split[m]
+                             .split[i]
+                             .split[j]
+                             .part_variances.none.variance > thresholds[2] &&
+                     !cyclic_refresh_segment_id_boosted(segment_id)) {
+            // We have some nominal amount of 16x16 variance (based on average),
+            // compute the minmax over the 8x8 sub-blocks, and if above
+            // threshold, force split to 8x8 block for this 16x16 block.
+            int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+                                            pixels_wide, pixels_high);
+            int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+            if (minmax > thresh_minmax) {
+              force_split[split_index] = 1;
+              force_split[5 + m2 + i] = 1;
+              force_split[m + 1] = 1;
+              force_split[0] = 1;
+            }
+          }
+        }
+        if (is_key_frame) {
+          force_split[split_index] = 0;
+          // Go down to 4x4 down-sampling for variance.
+          variance4x4downsample[i2 + j] = 1;
+          for (k = 0; k < 4; k++) {
+            int x8_idx = x16_idx + ((k & 1) << 3);
+            int y8_idx = y16_idx + ((k >> 1) << 3);
+            v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
+            fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+                                 pixels_wide, pixels_high, is_key_frame);
+          }
+        }
+      }
+    }
+  }
+
+  // Fill the rest of the variance tree by summing split partition values.
+  for (m = 0; m < 4; ++m) {
+    avg_32x32 = 0;
+    const int m2 = m << 2;
+    for (i = 0; i < 4; i++) {
+      const int i2 = (m2 + i) << 2;
+      for (j = 0; j < 4; j++) {
+        const int split_index = 21 + i2 + j;
+        if (variance4x4downsample[i2 + j] == 1) {
+          v16x16 *vtemp =
+              (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j];
+          for (k = 0; k < 4; k++)
+            fill_variance_tree(&vtemp->split[k], BLOCK_8X8);
+          fill_variance_tree(vtemp, BLOCK_16X16);
+          // If variance of this 16x16 block is above the threshold, force block
+          // to split. This also forces a split on the upper levels.
+          get_variance(&vtemp->part_variances.none);
+          if (vtemp->part_variances.none.variance > thresholds[3]) {
+            force_split[split_index] = 1;
+            force_split[5 + m2 + i] = 1;
+            force_split[m + 1] = 1;
+            force_split[0] = 1;
+          }
+        }
+      }
+      fill_variance_tree(&vt->split[m].split[i], BLOCK_32X32);
+      // If variance of this 32x32 block is above the threshold, or if its above
+      // (some threshold of) the average variance over the sub-16x16 blocks,
+      // then force this block to split. This also forces a split on the upper
+      // (64x64) level.
+      if (!force_split[5 + m2 + i]) {
+        get_variance(&vt->split[m].split[i].part_variances.none);
+        var_32x32 = vt->split[m].split[i].part_variances.none.variance;
+        max_var_32x32 = AOMMAX(var_32x32, max_var_32x32);
+        min_var_32x32 = AOMMIN(var_32x32, min_var_32x32);
+        if (vt->split[m].split[i].part_variances.none.variance >
+                thresholds[2] ||
+            (!is_key_frame &&
+             vt->split[m].split[i].part_variances.none.variance >
+                 (thresholds[2] >> 1) &&
+             vt->split[m].split[i].part_variances.none.variance >
+                 (avg_16x16[i] >> 1))) {
+          force_split[5 + m2 + i] = 1;
+          force_split[m + 1] = 1;
+          force_split[0] = 1;
+        } else if (!is_key_frame && cm->height <= 360 &&
+                   (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[2] >> 1) &&
+                   maxvar_16x16[i] > thresholds[2]) {
+          force_split[5 + m2 + i] = 1;
+          force_split[m + 1] = 1;
+          force_split[0] = 1;
+        }
+        avg_32x32 += var_32x32;
+      }
+    }
+    if (!force_split[1 + m]) {
+      fill_variance_tree(&vt->split[m], BLOCK_64X64);
+      get_variance(&vt->split[m].part_variances.none);
+      var_64x64 = vt->split[m].part_variances.none.variance;
+      max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+      min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If variance of this 64x64 block is above (some threshold of) the
+      // average variance over the sub-32x32 blocks, then force this block to
+      // split. Only checking this for noise level >= medium for now.
+
+      if (!is_key_frame &&
+          (max_var_32x32 - min_var_32x32) > 3 * (thresholds[1] >> 3) &&
+          max_var_32x32 > thresholds[1] >> 1)
+        force_split[1 + m] = 1;
+    }
+  }
+
+  if (!force_split[0]) {
+    fill_variance_tree(vt, BLOCK_128X128);
+    get_variance(&vt->part_variances.none);
+    if (!is_key_frame &&
+        (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
+        max_var_64x64 > thresholds[0] >> 1)
+      force_split[0] = 1;
+  }
+
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
+  if (!((mi_cols_remaining >= cm->seq_params.mib_size) &&
+        (mi_rows_remaining >= cm->seq_params.mib_size))) {
+    int bh = mi_size_high[BLOCK_128X128];
+    int bw = mi_size_wide[BLOCK_128X128];
+
+    int r, c;
+    for (r = 0; r < cm->seq_params.mib_size; r += bh) {
+      bw = mi_size_wide[BLOCK_128X128];
+      for (c = 0; c < cm->seq_params.mib_size; c += bw) {
+        BLOCK_SIZE sb_type =
+            find_partition_size(BLOCK_128X128, mi_rows_remaining - r,
+                                mi_cols_remaining - c, &bh, &bw);
+        set_block_size(cpi, x, xd, mi_row + r, mi_col + c, sb_type);
+      }
+    }
+
+  } else {
+    if (!set_vt_partitioning(cpi, x, xd, vt, BLOCK_128X128, mi_row, mi_col,
+                             thresholds[0], BLOCK_16X16, force_split[0])) {
+      for (m = 0; m < 4; ++m) {
+        const int x64_idx = ((m & 1) << 4);
+        const int y64_idx = ((m >> 1) << 4);
+        const int m2 = m << 2;
+
+        // Now go through the entire structure, splitting every block size until
+        // we get to one that's got a variance lower than our threshold.
+        if (!set_vt_partitioning(cpi, x, xd, &vt->split[m], BLOCK_64X64,
+                                 mi_row + y64_idx, mi_col + x64_idx,
+                                 thresholds[1], BLOCK_16X16,
+                                 force_split[1 + m])) {
+          for (i = 0; i < 4; ++i) {
+            const int x32_idx = ((i & 1) << 3);
+            const int y32_idx = ((i >> 1) << 3);
+            const int i2 = (m2 + i) << 2;
+            if (!set_vt_partitioning(
+                    cpi, x, xd, &vt->split[m].split[i], BLOCK_32X32,
+                    (mi_row + y64_idx + y32_idx), (mi_col + x64_idx + x32_idx),
+                    thresholds[2], BLOCK_16X16, force_split[5 + m2 + i])) {
+              for (j = 0; j < 4; ++j) {
+                const int x16_idx = ((j & 1) << 2);
+                const int y16_idx = ((j >> 1) << 2);
+                const int split_index = 21 + i2 + j;
+                // For inter frames: if variance4x4downsample[] == 1 for this
+                // 16x16 block, then the variance is based on 4x4 down-sampling,
+                // so use vt2 in set_vt_partitioning(), otherwise use vt.
+                v16x16 *vtemp =
+                    (!is_key_frame && variance4x4downsample[i2 + j] == 1)
+                        ? &vt2[i2 + j]
+                        : &vt->split[m].split[i].split[j];
+                if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
+                                         mi_row + y64_idx + y32_idx + y16_idx,
+                                         mi_col + x64_idx + x32_idx + x16_idx,
+                                         thresholds[3], BLOCK_8X8,
+                                         force_split[split_index])) {
+                  for (k = 0; k < 4; ++k) {
+                    const int x8_idx = (k & 1) << 1;
+                    const int y8_idx = (k >> 1) << 1;
+                    set_block_size(
+                        cpi, x, xd,
+                        (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+                        (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
+                        BLOCK_8X8);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (vt2) aom_free(vt2);
+  if (vt) aom_free(vt);
+  return 0;
+}
diff --git a/av1/encoder/var_based_part.h b/av1/encoder/var_based_part.h
new file mode 100644
index 0000000..c355224
--- /dev/null
+++ b/av1/encoder/var_based_part.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_state);
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      MACROBLOCK *x, int mi_row, int mi_col);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index c26f572..235480a 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -50,7 +50,7 @@
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
               aom_codec_enc_init(&enc, kCodecs[i], NULL, 0));
     EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
-              aom_codec_enc_config_default(kCodecs[i], &cfg, 1));
+              aom_codec_enc_config_default(kCodecs[i], &cfg, 2));
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
new file mode 100644
index 0000000..63b602b
--- /dev/null
+++ b/test/rt_end_to_end_test.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+
+// List of psnr thresholds for speed settings 0-8
+const double kPsnrThreshold[9] = { 36.9, 36.9, 36.85, 36.8, 36.6,
+                                   36.4, 36.0, 35.5,  34.5 };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << "}";
+}
+
+// TODO(kyslov): Add more test vectors
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+
+class RTEndToEndTest
+    : public ::libaom_test::CodecTestWith2Params<TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  RTEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+
+  virtual ~RTEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kRealTime);
+
+    cfg_.g_usage = 1;  // TODO(kyslov): Move it to encode_test_driver.cc
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() { return kPsnrThreshold[cpu_used_]; }
+
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold()) << "cpu used = " << cpu_used_;
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+};
+
+class RTEndToEndTestLarge : public RTEndToEndTest {};
+
+TEST_P(RTEndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestLarge,
+                          ::testing::ValuesIn(kTestVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::Values(kTestVectors[0]),
+                          ::testing::Values(kCpuUsedVectors[8]));
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 1b11373..5ae0279 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -64,6 +64,7 @@
             "${AOM_ROOT}/test/encode_test_driver.cc"
             "${AOM_ROOT}/test/encode_test_driver.h"
             "${AOM_ROOT}/test/end_to_end_test.cc"
+            "${AOM_ROOT}/test/rt_end_to_end_test.cc"
             "${AOM_ROOT}/test/error_resilience_test.cc"
             "${AOM_ROOT}/test/frame_size_tests.cc"
             "${AOM_ROOT}/test/horz_superres_test.cc"