Merge branch 'master' into nextgenv2
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index bc85daf..2bebdcb 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -141,7 +141,7 @@
                                        &vpx_highbd_tm_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
@@ -155,14 +155,14 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
@@ -176,7 +176,7 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
@@ -194,7 +194,7 @@
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
@@ -211,14 +211,14 @@
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
@@ -233,7 +233,7 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 10),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
@@ -251,7 +251,7 @@
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
@@ -268,14 +268,14 @@
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
@@ -290,7 +290,7 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 12),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 2270a06..4295204 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -190,7 +190,8 @@
                                        BLOCK_SIZE bsize,
                                        int64_t rate,
                                        int64_t dist,
-                                       int skip) {
+                                       int skip,
+                                       struct macroblock_plane *const p) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -198,12 +199,33 @@
   const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
   const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
-  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
-                                                      bsize);
+  int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
 
+  int is_skin = 0;
+  if (refresh_this_block == 0 &&
+      bsize <= BLOCK_16X16 &&
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const int stride = p[0].src.stride;
+    const int strideuv = p[1].src.stride;
+    const uint8_t ysource =
+        p[0].src.buf[y_height_shift * stride + y_width_shift];
+    const uint8_t usource =
+        p[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource =
+        p[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+    if (is_skin)
+      refresh_this_block = 1;
+  }
+
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
   if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index a5b3813..edf0a97 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -14,6 +14,8 @@
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,7 +95,8 @@
 void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
                                        MB_MODE_INFO *const mbmi,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                       int64_t rate, int64_t dist, int skip);
+                                       int64_t rate, int64_t dist, int skip,
+                                       struct macroblock_plane *const p);
 
 void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
                                              const MB_MODE_INFO *const mbmi,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f002788..7f94e19 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1045,7 +1045,7 @@
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
                                         mi_col, bsize, ctx->rate, ctx->dist,
-                                        x->skip);
+                                        x->skip, p);
     }
   }
 
@@ -1705,6 +1705,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = x->plane;
   const struct segmentation *const seg = &cm->seg;
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
@@ -1725,7 +1726,7 @@
     } else {
     // Setting segmentation map for cyclic_refresh.
       vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
-                                        ctx->rate, ctx->dist, x->skip);
+                                        ctx->rate, ctx->dist, x->skip, p);
     }
     vp9_init_plane_quantizers(cpi, x);
   }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cda38f6..ab647a4 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1349,11 +1349,25 @@
   const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
-                                               pd->pre[ref].stride)];
+    const int bw = b_width_log2_lookup[BLOCK_8X8];
+    const int h = 4 * (i >> bw);
+    const int w = 4 * (i & ((1 << bw) - 1));
+    const struct scale_factors *sf = &xd->block_refs[ref]->sf;
+    int y_stride = pd->pre[ref].stride;
+    uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w);
+
+    if (vp9_is_scaled(sf)) {
+      const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+      y_stride = xd->block_refs[ref]->buf->y_stride;
+      pre = xd->block_refs[ref]->buf->y_buffer;
+      pre += scaled_buffer_offset(x_start + w, y_start + h,
+                                  y_stride, sf);
+    }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_highbd_build_inter_predictor(pre, y_stride,
                                      dst, pd->dst.stride,
                                      &mi->bmi[i].as_mv[ref].as_mv,
                                      &xd->block_refs[ref]->sf, width, height,
@@ -1361,7 +1375,7 @@
                                      mi_col * MI_SIZE + 4 * (i % 2),
                                      mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
   } else {
-    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_build_inter_predictor(pre, y_stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
@@ -1370,7 +1384,7 @@
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
 #else
-    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_build_inter_predictor(pre, y_stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 3f4fe11..a2a0674 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -291,10 +291,10 @@
   specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4/;
diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm
index 9e6dcb5..233958a 100644
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -17,24 +17,20 @@
 pw_32: times 4 dd 32
 
 SECTION .text
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   movq                  m0, [aboveq]
   movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, one
-  mov                 oned, 0x0001
-  pxor                  m1, m1
-  movd                  m3, oned
-  pshufw                m3, m3, 0x0
   paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
+  pshuflw               m1, m0, 0xe
+  paddw                 m0, m1
+  pshuflw               m1, m0, 0x1
+  paddw                 m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq*2], m0
   lea                 dstq, [dstq+strideq*4]
@@ -261,43 +257,44 @@
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
-  pshufw                m1, m1, 0x0
+  pshuflw               m1, m1, 0x0
+  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
+  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  movd                  m3, oned
+  pcmpeqw               m3, m3         ; every word = 0xffff
   movd                  m4, bpsd
-  pshufw                m3, m3, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  mova                  m2, m3
+  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
-  add                leftq, 8
-  psubw                 m3, m2 ; max possible value
-  pxor                  m4, m4 ; min possible value
-  psubw                 m0, m1
-.loop:
-  movq                  m1, [leftq+lineq*4]
-  movq                  m2, [leftq+lineq*4+2]
-  pshufw                m1, m1, 0x0
-  pshufw                m2, m2, 0x0
-  paddw                 m1, m0
+  pcmpeqw               m2, m2         ; all-ones mask used to invert m3 below
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m2         ; max possible value
+  movq                  m1, [leftq]    ; l1 l2 l3 l4 (8-byte load, no alignment needed)
+  pshuflw               m2, m1, 0x0
+  pshuflw               m5, m1, 0x55
+  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
   paddw                 m2, m0
   ;Clamp to the bit-depth
-  pminsw                m1, m3
   pminsw                m2, m3
-  pmaxsw                m1, m4
   pmaxsw                m2, m4
   ;Store the values
-  movq    [dstq          ], m1
-  movq    [dstq+strideq*2], m2
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
   lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m2, m1, 0xaa
+  pshuflw               m5, m1, 0xff
+  movlhps               m2, m5         ; l3 l3 l3 l3 l4 l4 l4 l4
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
+  RET
 
 INIT_XMM sse2
 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one