Merge "Remove unnecessary border extension when frame size change."

diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index d9dace6..e96bc4f 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c

@@ -25,55 +25,65 @@
 static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
                           size_t member_offset) {
   int mi_row, mi_col;
-  int mi_index = 0;
-  // TODO(hkuang): Fix this debug function.
-  MODE_INFO **mi = &cm->mi;
+  MODE_INFO *mi = cm->mi;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
   char prefix = descriptor[0];
 
   log_frame_info(cm, descriptor, file);
-  mi_index = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(file, "%2d ",
-              *((int*) ((char *) (&mi[mi_index]->mbmi) +
-                        member_offset)));
-      mi_index++;
+              *((int*) ((char *) (&mi->src_mi->mbmi) +
+                                  member_offset)));
+      mi++;
     }
     fprintf(file, "\n");
-    mi_index += 8;
+    mi += 8;
   }
   fprintf(file, "\n");
 }
+
 void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
   int mi_row;
   int mi_col;
-  int mi_index = 0;
   FILE *mvs = fopen(file, "a");
-  // TODO(hkuang): Fix this debug function.
-  MODE_INFO **mi = &cm->mi;
+  MODE_INFO *mi = cm->mi;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
 
   print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
-  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
   print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
 
+  // output skip infomation.
+  log_frame_info(cm, "Skips:", mvs);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi->src_mi->mbmi.skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
   log_frame_info(cm, "Vectors ", mvs);
+  mi = cm->mi;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row,
-                               mi[mi_index]->mbmi.mv[0].as_mv.col);
-      mi_index++;
+      fprintf(mvs, "%4d:%4d ", mi->src_mi->mbmi.mv[0].as_mv.row,
+                               mi->src_mi->mbmi.mv[0].as_mv.col);
+      mi++;
     }
     fprintf(mvs, "\n");
-    mi_index += 8;
+    mi += 8;
   }
   fprintf(mvs, "\n");
 

diff --git a/vp9/common/vp9_thread.h b/vp9/common/vp9_thread.h
index c24ef5f..12848fe 100644
--- a/vp9/common/vp9_thread.h
+++ b/vp9/common/vp9_thread.h

@@ -28,7 +28,7 @@
 
 #if CONFIG_MULTITHREAD
 
-#if defined(_WIN32)
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 #include <errno.h>  // NOLINT
 #include <process.h>  // NOLINT
 #include <windows.h>  // NOLINT

diff --git a/vp9/common/x86/vp9_high_intrapred_sse2.asm b/vp9/common/x86/vp9_high_intrapred_sse2.asm
index 721126c..b12d29c 100644
--- a/vp9/common/x86/vp9_high_intrapred_sse2.asm
+++ b/vp9/common/x86/vp9_high_intrapred_sse2.asm

@@ -345,7 +345,7 @@
 
 %if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 0885909..99bb930 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -471,6 +471,43 @@
   return 0;
 }
 
+
+void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  if (sf->partition_search_type != VAR_BASED_PARTITION) {
+    return;
+  } else {
+    VP9_COMMON *const cm = &cpi->common;
+    const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+    const int is_key_frame = (cm->frame_type == KEY_FRAME);
+    const int use_4x4_partition = is_key_frame;
+    const int low_res = (cm->width <= 352 && cm->height <= 288);
+    const int threshold_multiplier = is_key_frame ? 80 : 4;
+    const int64_t threshold_base = (int64_t)(threshold_multiplier *
+        vp9_convert_qindex_to_q(q, cm->bit_depth));
+    cpi->vbp_threshold = threshold_base;
+    cpi->vbp_threshold_bsize_min = threshold_base << oxcf->speed;
+    cpi->vbp_threshold_bsize_max = threshold_base;
+
+    if (is_key_frame) {
+      cpi->vbp_threshold = threshold_base >> 2;
+      cpi->vbp_threshold_bsize_min = threshold_base << 2;
+    } else if (low_res) {
+      cpi->vbp_threshold_bsize_min = threshold_base << 3;
+      cpi->vbp_threshold_bsize_max = threshold_base >> 2;
+    }
+    // TODO(marpan): Allow 4x4 partitions for inter-frames.
+    // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
+    // If 4x4 partition is not used, then 8x8 partition will be selected
+    // if variance of 16x16 block is very high, so use larger threshold
+    // for 16x16 (threshold_bsize_min) in that case.
+    cpi->vbp_threshold_16x16 = (use_4x4_partition) ?
+      cpi->vbp_threshold : cpi->vbp_threshold_bsize_min;
+    cpi->vbp_bsize_min = (use_4x4_partition) ? BLOCK_8X8 : BLOCK_16X16;
+  }
+}
+
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for downs-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
@@ -479,7 +516,6 @@
                                 int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-
   int i, j, k, m;
   v64x64 vt;
   v16x16 vt2[16];
@@ -489,34 +525,12 @@
   int dp;
   int pixels_wide = 64, pixels_high = 64;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-  const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+
   // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
   const int use_4x4_partition = is_key_frame;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
-  int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0;
-  const int threshold_multiplier = is_key_frame ? 80 : 4;
-  int64_t threshold_base;
-  int64_t threshold;
-  int64_t threshold_bsize_min;
-  int64_t threshold_bsize_max;
-
-  vp9_clear_system_state();
-  threshold_base = (int64_t)(threshold_multiplier *
-      vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
-  threshold = threshold_base;
-  threshold_bsize_min = threshold_base << cpi->oxcf.speed;
-  threshold_bsize_max = threshold_base;
-
-  // Modify thresholds for key frame and for low-resolutions (set lower
-  // thresholds to favor split).
-  if (is_key_frame) {
-    threshold = threshold_base >> 2;
-    threshold_bsize_min = threshold_base << 2;
-  } else if (low_res) {
-    threshold_bsize_min = threshold_base << 3;
-    threshold_bsize_max = threshold_base >> 2;
-  }
 
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
@@ -531,7 +545,8 @@
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
     unsigned int var = 0, sse;
-    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+        &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
     mbmi->ref_frame[1] = NONE;
     mbmi->sb_type = BLOCK_64X64;
@@ -619,7 +634,7 @@
       }
       if (is_key_frame || (low_res &&
           vt.split[i].split[j].part_variances.none.variance >
-          (threshold << 1))) {
+          (cpi->vbp_threshold << 1))) {
         // Go down to 4x4 down-sampling for variance.
         variance4x4downsample[i2 + j] = 1;
         for (k = 0; k < 4; k++) {
@@ -680,30 +695,22 @@
   }
   fill_variance_tree(&vt, BLOCK_64X64);
 
-
   // Now go through the entire structure,  splitting every block size until
   // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           threshold_bsize_max, BLOCK_16X16)) {
+                           cpi->vbp_threshold_bsize_max, BLOCK_16X16)) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
       const int i2 = i << 2;
       if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
                                (mi_row + y32_idx), (mi_col + x32_idx),
-                               threshold, BLOCK_16X16)) {
+                               cpi->vbp_threshold,
+                               BLOCK_16X16)) {
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
-          // TODO(marpan): Allow 4x4 partitions for inter-frames.
-          // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
-          // If 4x4 partition is not used, then 8x8 partition will be selected
-          // if variance of 16x16 block is very high, so use larger threshold
-          // for 16x16 (threshold_bsize_min) in that case.
-          uint64_t threshold_16x16 = (use_4x4_partition) ? threshold :
-                                      threshold_bsize_min;
-          BLOCK_SIZE bsize_min = (use_4x4_partition) ? BLOCK_8X8 : BLOCK_16X16;
           // For inter frames: if variance4x4downsample[] == 1 for this 16x16
           // block, then the variance is based on 4x4 down-sampling, so use vt2
           // in set_vt_partioning(), otherwise use vt.
@@ -713,7 +720,8 @@
           if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
                                    mi_row + y32_idx + y16_idx,
                                    mi_col + x32_idx + x16_idx,
-                                   threshold_16x16, bsize_min)) {
+                                   cpi->vbp_threshold_16x16,
+                                   cpi->vbp_bsize_min)) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
@@ -722,7 +730,8 @@
                                          BLOCK_8X8,
                                          mi_row + y32_idx + y16_idx + y8_idx,
                                          mi_col + x32_idx + x16_idx + x8_idx,
-                                         threshold_bsize_min, BLOCK_8X8)) {
+                                         cpi->vbp_threshold_bsize_min,
+                                         BLOCK_8X8)) {
                   set_block_size(cpi, xd,
                                  (mi_row + y32_idx + y16_idx + y8_idx),
                                  (mi_col + x32_idx + x16_idx + x8_idx),

diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 556f3a5..8d545b6 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h

@@ -38,6 +38,8 @@
 void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
                      int tile_row, int tile_col);
 
+void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 65b6605..e2ed95c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -50,6 +50,7 @@
 #include "vp9/encoder/vp9_temporal_filter.h"
 #include "vp9/encoder/vp9_resize.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 
 
 #define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
@@ -60,12 +61,14 @@
                                          // mv. Choose a very high value for
                                          // now so that HIGH_PRECISION is always
                                          // chosen.
-
 // #define OUTPUT_YUV_REC
 
 #ifdef OUTPUT_YUV_DENOISED
 FILE *yuv_denoised_file = NULL;
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
 #ifdef OUTPUT_YUV_REC
 FILE *yuv_rec_file;
 #endif
@@ -1541,6 +1544,9 @@
   yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
 #ifdef OUTPUT_YUV_REC
   yuv_rec_file = fopen("rec.yuv", "wb");
 #endif
@@ -1829,6 +1835,9 @@
   fclose(yuv_denoised_file);
 #endif
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
@@ -2116,8 +2125,7 @@
   return 0;
 }
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-#if defined(OUTPUT_YUV_DENOISED)
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
 // The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
 // as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
 // not denoise the UV channels at this time. If ever we implement UV channel
@@ -2148,7 +2156,6 @@
   } while (--h);
 }
 #endif
-#endif
 
 #ifdef OUTPUT_YUV_REC
 void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
@@ -2800,7 +2807,10 @@
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
   vp9_set_quantizer(cm, q);
+  vp9_set_vbp_thresholds(cpi, q);
+
   setup_frame(cpi);
+
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -3296,7 +3306,11 @@
   }
 #endif
 #endif
-
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_video_frame > 1) {
+    vp9_compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 1e4c982..35c5a48 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -450,6 +450,13 @@
 
   int resize_pending;
 
+  // VAR_BASED_PARTITION thresholds
+  int64_t vbp_threshold;
+  int64_t vbp_threshold_bsize_min;
+  int64_t vbp_threshold_bsize_max;
+  int64_t vbp_threshold_16x16;
+  BLOCK_SIZE vbp_bsize_min;
+
   // Multi-threading
   int num_workers;
   VP9Worker *workers;

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 70e90b6..071747e 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -577,6 +577,9 @@
     ZEROMV, NEARESTMV, NEARMV, NEWMV,
 };
 
+static const int ref_frame_cost[MAX_REF_FRAMES] = {
+    1235, 229, 530, 615,
+};
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -807,9 +810,8 @@
           vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
           model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
                             &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
-          cost = RDCOST(x->rdmult, x->rddiv,
-                        vp9_get_switchable_rate(cpi, xd) + pf_rate[filter],
-                        pf_dist[filter]);
+          pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+          cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
           pf_tx_size[filter] = mbmi->tx_size;
           if (cost < best_cost) {
             best_filter = filter;
@@ -846,6 +848,8 @@
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
         model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
                           &var_y, &sse_y);
+        this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+            vp9_get_switchable_rate(cpi, xd) : 0;
       }
 
       // chroma component rate-distortion cost modeling
@@ -865,6 +869,7 @@
       this_rdc.rate += rate_mv;
       this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                   [INTER_OFFSET(this_mode)];
+      this_rdc.rate += ref_frame_cost[ref_frame];
       this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                this_rdc.rate, this_rdc.dist);
 
@@ -971,6 +976,7 @@
       this_rdc.rate = args.rate;
       this_rdc.dist = args.dist;
       this_rdc.rate += cpi->mbmode_cost[this_mode];
+      this_rdc.rate += ref_frame_cost[INTRA_FRAME];
       this_rdc.rate += intra_cost_penalty;
       this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                this_rdc.rate, this_rdc.dist);
@@ -1124,8 +1130,6 @@
         int64_t b_best_rd = INT64_MAX;
         const int i = idy * 2 + idx;
         PREDICTION_MODE this_mode;
-        int b_rate = 0;
-        int64_t b_dist = 0;
         RD_COST this_rdc;
         unsigned int var_y, sse_y;
 
@@ -1153,6 +1157,7 @@
                                       &b_mv[NEARMV]);
 
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          int b_rate = 0;
           xd->mi[0].bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
 
           if (this_mode == NEWMV) {
@@ -1214,6 +1219,9 @@
                                          &x->pred_sse[ref_frame], NULL, 0, 0);
 
             xd->mi[0].bmi[i].as_mv[0].as_mv = tmp_mv;
+          } else {
+            b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                          [INTER_OFFSET(this_mode)];
           }
 
           vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
@@ -1230,7 +1238,6 @@
                             &var_y, &sse_y);
 
           this_rdc.rate += b_rate;
-          this_rdc.dist += b_dist;
           this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                    this_rdc.rate, this_rdc.dist);
           if (this_rdc.rdcost < b_best_rd) {

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8d316d6..6975137 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -1173,8 +1173,8 @@
   // Modify frame size target when down-scaling.
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
       rc->frame_size_selector != UNSCALED)
-    rc->this_frame_target =
-        rc->this_frame_target * rate_thresh_mult[rc->frame_size_selector];
+    rc->this_frame_target = (int)(rc->this_frame_target
+        * rate_thresh_mult[rc->frame_size_selector]);
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /

diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 20ca4ca..0b6d11e 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c

@@ -279,9 +279,11 @@
 
   set_block_thresholds(cm, rd);
 
-  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
+  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
     fill_token_costs(x->token_costs, cm->fc->coef_probs);
 
+  if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+      cm->frame_type == KEY_FRAME) {
     for (i = 0; i < PARTITION_CONTEXTS; ++i)
       vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
                       vp9_partition_tree);

diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
new file mode 100644
index 0000000..08ebcf8
--- /dev/null
+++ b/vp9/encoder/vp9_skin_detection.c

@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold = 1570636;                    // q18
+
+// Thresholds on luminance.
+static const int y_low = 20;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(int cb, int cr) {
+  const int cb_q6 = cb << 6;
+  const int cr_q6 = cr << 6;
+  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+  const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+      skin_inv_cov[1] * cbcr_diff_q2 +
+      skin_inv_cov[2] * cbcr_diff_q2 +
+      skin_inv_cov[3] * cr_diff_q2;
+  return skin_diff;
+}
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+  if (y < y_low || y > y_high)
+    return 0;
+  else
+    return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mi_row, mi_col;
+  VP9_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+  YV12_BUFFER_CONFIG skinmap;
+  vpx_memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+  if (vp9_alloc_frame_buffer(&skinmap, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment)) {
+      vp9_free_frame_buffer(&skinmap);
+      return;
+  }
+  vpx_memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+  y = skinmap.y_buffer;
+  // Loop through 8x8 blocks and set skin map based on center pixel of block.
+  // Set y to white for skin block, otherwise set to source with gray scale.
+  // Ignore rightmost/bottom boundary blocks.
+  for (mi_row = 0; mi_row < cm->mi_rows - 1; ++mi_row) {
+    for (mi_col = 0; mi_col < cm->mi_cols - 1; ++mi_col) {
+      // Use middle pixel for each 8x8 block for skin detection.
+      // If middle pixel is skin, assign whole 8x8 block to skin.
+      const uint8_t ysource = src_y[4 * src_ystride + 4];
+      const uint8_t usource = src_u[2 * src_uvstride + 2];
+      const uint8_t vsource = src_v[2 * src_uvstride + 2];
+      const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          if (is_skin)
+            y[i * src_ystride + j] = 255;
+          else
+            y[i * src_ystride + j] = src_y[i * src_ystride + j];
+        }
+      }
+      y += 8;
+      src_y += 8;
+      src_u += 4;
+      src_v += 4;
+    }
+    y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+    src_y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+    src_u += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+    src_v += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+  }
+  vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+  vp9_free_frame_buffer(&skinmap);
+}
+#endif

diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
new file mode 100644
index 0000000..3d4e737
--- /dev/null
+++ b/vp9/encoder/vp9_skin_detection.h

@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_
+#define VP9_ENCODER_VP9_SKIN_MAP_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SKIN_MAP_H_

diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index eaa0acc..8722d9c 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h

@@ -421,4 +421,3 @@
 #endif
 
 #endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
-

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index fbdd4ba..2b3f894 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -88,6 +88,8 @@
 VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
 VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c

diff --git a/vpxenc.c b/vpxenc.c
index f42e02c..46af600 100644
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -398,6 +398,22 @@
     NULL, "frame-boost", 1,
     "Enable frame periodic boost (0: off (default), 1: on)");
 
+static const struct arg_enum_list color_space_enum[] = {
+  { "unknown", VPX_CS_UNKNOWN },
+  { "bt601", VPX_CS_BT_601 },
+  { "bt709", VPX_CS_BT_709 },
+  { "smpte170", VPX_CS_SMPTE_170 },
+  { "smpte240", VPX_CS_SMPTE_240 },
+  { "bt2020", VPX_CS_BT_2020 },
+  { "reserved", VPX_CS_RESERVED },
+  { "sRGB", VPX_CS_SRGB },
+  { NULL, 0 }
+};
+
+static const arg_def_t input_color_space = ARG_DEF_ENUM(
+    NULL, "color-space", 1,
+    "The color space of input content:", color_space_enum);
+
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
   {"8",  VPX_BITS_8},
@@ -429,7 +445,7 @@
   &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
   &gf_cbr_boost_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
-  &noise_sens, &tune_content,
+  &noise_sens, &tune_content, &input_color_space,
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
   &bitdeptharg, &inbitdeptharg,
 #endif