Merge "vp9/decode_tiles_mt: remove unnecessary local"

diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index 8c5d5a2..d779706 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc

@@ -136,7 +136,23 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_SSE2
+#if CONFIG_USE_X86INC && HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  EXPECT_EQ(8, bps);  // Adapter for the 5-arg test signature; kernel is 8-bit.
+  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
+int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
+                                           const tran_low_t *dqcoeff,
+                                           intptr_t block_size,
+                                           int64_t *ssz, int bps) {
+  EXPECT_EQ(8, bps);  // Adapter for the 5-arg test signature; kernel is 8-bit.
+  return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
+}
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, ErrorBlockTest,
     ::testing::Values(
@@ -145,7 +161,9 @@
         make_tuple(&vp9_highbd_block_error_sse2,
                    &vp9_highbd_block_error_c, VPX_BITS_12),
         make_tuple(&vp9_highbd_block_error_sse2,
-                   &vp9_highbd_block_error_c, VPX_BITS_8)));
+                   &vp9_highbd_block_error_c, VPX_BITS_8),
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace

diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index c345068..e2454b0 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h

@@ -279,6 +279,7 @@
   int error_resilient_mode;
 
   int log2_tile_cols, log2_tile_rows;
+  int tile_sz_mag;
   int byte_alignment;
   int skip_loop_filter;
 

diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 62c2942..ad02c95 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c

@@ -170,8 +170,12 @@
 static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
+#if CONFIG_MISC_FIXES
+    vp10_diff_update_prob(r, &p[i]);
+#else
     if (vpx_read(r, MV_UPDATE_PROB))
       p[i] = (vpx_read_literal(r, 7) << 1) | 1;
+#endif
 }
 
 static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
@@ -1370,6 +1374,15 @@
   cm->log2_tile_rows = vpx_rb_read_bit(rb);
   if (cm->log2_tile_rows)
     cm->log2_tile_rows += vpx_rb_read_bit(rb);
+
+#if CONFIG_MISC_FIXES
+  // tile size magnitude
+  if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
+    cm->tile_sz_mag = vpx_rb_read_literal(rb, 2);
+  }
+#else
+  cm->tile_sz_mag = 3;
+#endif
 }
 
 typedef struct TileBuffer {
@@ -1378,10 +1391,27 @@
   int col;  // only used with multi-threaded decoding
 } TileBuffer;
 
+static int mem_get_varsize(const uint8_t *data, const int mag) {
+  switch (mag) {  // mag picks the reader; the caller advances by mag + 1 bytes
+    case 0:
+      return data[0];
+    case 1:
+      return mem_get_le16(data);
+    case 2:
+      return mem_get_le24(data);
+    case 3:
+      return mem_get_le32(data);
+  }
+  // Only reached when mag is outside 0..3; every handled case returned above.
+  assert("Invalid tile size marker value" && 0);
+  // With assertions compiled out, an unhandled mag is reported as -1.
+  return -1;
+}
+
 // Reads the next tile returning its size and adjusting '*data' accordingly
 // based on 'is_last'.
 static void get_tile_buffer(const uint8_t *const data_end,
-                            int is_last,
+                            const int tile_sz_mag, int is_last,
                             struct vpx_internal_error_info *error_info,
                             const uint8_t **data,
                             vpx_decrypt_cb decrypt_cb, void *decrypt_state,
@@ -1395,12 +1425,12 @@
 
     if (decrypt_cb) {
       uint8_t be_data[4];
-      decrypt_cb(decrypt_state, *data, be_data, 4);
-      size = mem_get_be32(be_data);
+      decrypt_cb(decrypt_state, *data, be_data, tile_sz_mag + 1);
+      size = mem_get_varsize(be_data, tile_sz_mag);
     } else {
-      size = mem_get_be32(*data);
+      size = mem_get_varsize(*data, tile_sz_mag);
     }
-    *data += 4;
+    *data += tile_sz_mag + 1;
 
     if (size > (size_t)(data_end - *data))
       vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
@@ -1426,7 +1456,8 @@
       const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
       TileBuffer *const buf = &tile_buffers[r][c];
       buf->col = c;
-      get_tile_buffer(data_end, is_last, &pbi->common.error, &data,
+      get_tile_buffer(data_end, pbi->common.tile_sz_mag,
+                      is_last, &pbi->common.error, &data,
                       pbi->decrypt_cb, pbi->decrypt_state, buf);
     }
   }

diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index 23851af..03a81f5 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c

@@ -459,6 +459,9 @@
   // an invalid bitstream and need to return an error.
 
   uint8_t marker;
+#if CONFIG_MISC_FIXES
+  size_t frame_sz_sum = 0;
+#endif
 
   assert(data_sz);
   marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
@@ -467,7 +470,7 @@
   if ((marker & 0xe0) == 0xc0) {
     const uint32_t frames = (marker & 0x7) + 1;
     const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * frames;
+    const size_t index_sz = 2 + mag * (frames - CONFIG_MISC_FIXES);
 
     // This chunk is marked as having a superframe index but doesn't have
     // enough data for it, thus it's an invalid superframe index.
@@ -498,13 +501,19 @@
         x = clear_buffer;
       }
 
-      for (i = 0; i < frames; ++i) {
+      for (i = 0; i < frames - CONFIG_MISC_FIXES; ++i) {
         uint32_t this_sz = 0;
 
         for (j = 0; j < mag; ++j)
           this_sz |= (*x++) << (j * 8);
         sizes[i] = this_sz;
+#if CONFIG_MISC_FIXES
+        frame_sz_sum += this_sz;
+#endif
       }
+#if CONFIG_MISC_FIXES
+      sizes[i] = data_sz - index_sz - frame_sz_sum;
+#endif
       *count = frames;
     }
   }

diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index 2902ece..d39e3dc 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c

@@ -163,26 +163,33 @@
         case CATEGORY5_TOKEN:
           val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
           break;
-        case CATEGORY6_TOKEN:
+        case CATEGORY6_TOKEN: {
+#if CONFIG_MISC_FIXES
+          const int skip_bits = TX_SIZES - 1 - tx_size;
+#else
+          const int skip_bits = 0;
+#endif
+          const uint8_t *cat6p = cat6_prob + skip_bits;
 #if CONFIG_VP9_HIGHBITDEPTH
           switch (xd->bd) {
             case VPX_BITS_8:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
               break;
             case VPX_BITS_10:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 16, r);
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
               break;
             case VPX_BITS_12:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 18, r);
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
               break;
             default:
               assert(0);
               return -1;
           }
 #else
-          val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+          val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
 #endif
           break;
+        }
       }
     }
     v = (val * dqv) >> dq_shift;

diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 485fc53..1661fbd 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c

@@ -122,8 +122,11 @@
 
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
-                           vpx_bit_depth_t bit_depth) {
+                           vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
   TOKENEXTRA *p = *tp;
+#if !CONFIG_MISC_FIXES
+  (void) tx;
+#endif
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
@@ -171,6 +174,12 @@
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
+#if CONFIG_MISC_FIXES
+      int skip_bits =
+          (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+#else
+      int skip_bits = 0;
+#endif
 
       if (l) {
         const unsigned char *pb = b->prob;
@@ -180,7 +189,12 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          vpx_write(w, bb, pb[i >> 1]);
+          if (skip_bits) {
+            skip_bits--;
+            assert(!bb);
+          } else {
+            vpx_write(w, bb, pb[i >> 1]);
+          }
           i = b->tree[i + bb];
         } while (n);
       }
@@ -190,7 +204,7 @@
     ++p;
   }
 
-  *tp = p + (p->token == EOSB_TOKEN);
+  *tp = p;
 }
 
 static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
@@ -382,6 +396,7 @@
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
+  int plane;
 
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   m = xd->mi[0];
@@ -398,8 +413,16 @@
     pack_inter_mode_mvs(cpi, m, w);
   }
 
-  assert(*tok < tok_end);
-  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
+  if (!m->mbmi.skip) {
+    assert(*tok < tok_end);
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+                         : m->mbmi.tx_size;
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+    }
+  }
 }
 
 static void write_partition(const VP10_COMMON *const cm,
@@ -940,7 +963,8 @@
   }
 }
 
-static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr) {
+static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr,
+                           unsigned int *max_tile_sz) {
   VP10_COMMON *const cm = &cpi->common;
   vpx_writer residual_bc;
   int tile_row, tile_col;
@@ -948,6 +972,7 @@
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
+  unsigned int max_tile = 0;
 
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
@@ -971,13 +996,15 @@
       vpx_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
-        mem_put_be32(data_ptr + total_size, residual_bc.pos);
+        mem_put_le32(data_ptr + total_size, residual_bc.pos);
+        max_tile = max_tile > residual_bc.pos ? max_tile : residual_bc.pos;
         total_size += 4;
       }
 
       total_size += residual_bc.pos;
     }
   }
+  *max_tile_sz = max_tile;
 
   return total_size;
 }
@@ -1278,15 +1305,62 @@
   return header_bc.pos;
 }
 
-void vp10_pack_bitstream(VP10_COMP *cpi, uint8_t *dest, size_t *size) {
+#if CONFIG_MISC_FIXES
+static int remux_tiles(uint8_t *dest, const int sz,
+                       const int n_tiles, const int mag) {
+  int rpos = 0, wpos = 0, n;  // read / write offsets into dest
+  // Compacts dest in place: each 4-byte size prefix becomes (mag + 1) bytes.
+  for (n = 0; n < n_tiles; n++) {
+    int tile_sz;
+    // The final tile carries no size prefix; it spans the rest of the input.
+    if (n == n_tiles - 1) {
+      tile_sz = sz - rpos;
+    } else {
+      tile_sz = mem_get_le32(&dest[rpos]);
+      rpos += 4;
+      switch (mag) {
+        case 0:
+          dest[wpos] = tile_sz;
+          break;
+        case 1:
+          mem_put_le16(&dest[wpos], tile_sz);
+          break;
+        case 2:
+          mem_put_le24(&dest[wpos], tile_sz);
+          break;
+        case 3:  // remuxing should only happen if mag < 3
+        default:
+          assert("Invalid value for tile size magnitude" && 0);
+      }
+      wpos += mag + 1;
+    }
+    // memmove rather than memcpy: source and destination may overlap in dest.
+    memmove(&dest[wpos], &dest[rpos], tile_sz);
+    wpos += tile_sz;
+    rpos += tile_sz;
+  }
+  // Output must be shorter than the input, and all sz input bytes consumed.
+  assert(rpos > wpos);
+  assert(rpos == sz);
+  // The final write offset is the size of the rewritten data.
+  return wpos;
+}
+#endif
+
+void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
+  VP10_COMMON *const cm = &cpi->common;
   uint8_t *data = dest;
   size_t first_part_size, uncompressed_hdr_size;
   struct vpx_write_bit_buffer wb = {data, 0};
   struct vpx_write_bit_buffer saved_wb;
+  unsigned int max_tile, data_sz;
+  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+  const int have_tiles = n_log2_tiles > 0;
 
   write_uncompressed_header(cpi, &wb);
   saved_wb = wb;
-  vpx_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+  // Placeholder: first_part_size (and the tile size magnitude) are set later.
+  vpx_wb_write_literal(&wb, 0, 16 + have_tiles * 2);
 
   uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
   data += uncompressed_hdr_size;
@@ -1295,10 +1369,32 @@
 
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
+
+  data_sz = encode_tiles(cpi, data, &max_tile);
+#if CONFIG_MISC_FIXES
+  if (max_tile > 0) {
+    int mag;
+    unsigned int mask;
+
+    // Choose the (tile size) magnitude
+    for (mag = 0, mask = 0xff; mag < 4; mag++) {
+      if (max_tile <= mask)
+        break;
+      mask <<= 8;
+      mask |= 0xff;
+    }
+    assert(n_log2_tiles > 0);
+    vpx_wb_write_literal(&saved_wb, mag, 2);
+    if (mag < 3)
+      data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+  } else {
+    assert(n_log2_tiles == 0);
+  }
+#endif
+  data += data_sz;
+
   // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
   vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
-  data += encode_tiles(cpi, data);
-
   *size = data - dest;
 }

diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 019e5b1..ce1530c 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c

@@ -1356,9 +1356,6 @@
 
   if (output_enabled) {
     update_stats(&cpi->common, td);
-
-    (*tp)->token = EOSB_TOKEN;
-    (*tp)++;
   }
 }
 

diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index ca2de1f..0736c65 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c

@@ -15,6 +15,7 @@
 
 #include "vp10/encoder/cost.h"
 #include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/subexp.h"
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
@@ -134,8 +135,12 @@
   }
 }
 
-static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
-                     vpx_prob upd_p) {
+static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                      vpx_prob upd_p) {
+#if CONFIG_MISC_FIXES
+  (void) upd_p;
+  vp10_cond_prob_diff_update(w, cur_p, ct);
+#else
   const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
   const int update = cost_branch256(ct, *cur_p) + vp10_cost_zero(upd_p) >
                      cost_branch256(ct, new_p) + vp10_cost_one(upd_p) + 7 * 256;
@@ -144,7 +149,7 @@
     *cur_p = new_p;
     vpx_write_literal(w, new_p >> 1, 7);
   }
-  return update;
+#endif
 }
 
 static void write_mv_update(const vpx_tree_index *tree,

diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index af915fe..cbebd5a 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c

@@ -628,8 +628,16 @@
   }
 
   if (!dry_run) {
+    int plane;
+
     td->counts->skip[ctx][0] += skip_inc;
-    vp10_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+                                              &arg);
+      (*t)->token = EOSB_TOKEN;
+      (*t)++;
+    }
   } else {
     vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
   }

diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index 304f74e..409ed1c 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c

@@ -91,7 +91,9 @@
   size_t                  pending_cx_data_sz;
   int                     pending_frame_count;
   size_t                  pending_frame_sizes[8];
+#if !CONFIG_MISC_FIXES
   size_t                  pending_frame_magnitude;
+#endif
   vpx_image_t             preview_img;
   vpx_enc_frame_flags_t   next_frame_flags;
   vp8_postproc_cfg_t      preview_ppcfg;
@@ -781,24 +783,39 @@
   uint8_t marker = 0xc0;
   unsigned int mask;
   int mag, index_sz;
+#if CONFIG_MISC_FIXES
+  int i;
+  size_t max_frame_sz = 0;
+#endif
 
   assert(ctx->pending_frame_count);
   assert(ctx->pending_frame_count <= 8);
 
   // Add the number of frames to the marker byte
   marker |= ctx->pending_frame_count - 1;
+#if CONFIG_MISC_FIXES
+  for (i = 0; i < ctx->pending_frame_count - 1; i++) {
+    const size_t frame_sz = (unsigned int) ctx->pending_frame_sizes[i];
+    max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
+  }
+#endif
 
   // Choose the magnitude
   for (mag = 0, mask = 0xff; mag < 4; mag++) {
+#if CONFIG_MISC_FIXES
+    if (max_frame_sz <= mask)
+      break;
+#else
     if (ctx->pending_frame_magnitude < mask)
       break;
+#endif
     mask <<= 8;
     mask |= 0xff;
   }
   marker |= mag << 3;
 
   // Write the index
-  index_sz = 2 + (mag + 1) * ctx->pending_frame_count;
+  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - CONFIG_MISC_FIXES);
   if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
     uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
     int i, j;
@@ -818,7 +835,7 @@
 #endif
 
     *x++ = marker;
-    for (i = 0; i < ctx->pending_frame_count; i++) {
+    for (i = 0; i < ctx->pending_frame_count - CONFIG_MISC_FIXES; i++) {
       unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
 
       for (j = 0; j <= mag; j++) {
@@ -974,7 +991,9 @@
             ctx->pending_cx_data = cx_data;
           ctx->pending_cx_data_sz += size;
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
           ctx->pending_frame_magnitude |= size;
+#endif
           cx_data += size;
           cx_data_sz -= size;
 
@@ -991,7 +1010,9 @@
             ctx->pending_cx_data = NULL;
             ctx->pending_cx_data_sz = 0;
             ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
             ctx->pending_frame_magnitude = 0;
+#endif
             ctx->output_cx_pkt_cb.output_cx_pkt(
                 &pkt, ctx->output_cx_pkt_cb.user_priv);
           }
@@ -1008,7 +1029,9 @@
 
         if (ctx->pending_cx_data) {
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
           ctx->pending_frame_magnitude |= size;
+#endif
           ctx->pending_cx_data_sz += size;
           // write the superframe only for the case when
           if (!ctx->output_cx_pkt_cb.output_cx_pkt)
@@ -1018,7 +1041,9 @@
           ctx->pending_cx_data = NULL;
           ctx->pending_cx_data_sz = 0;
           ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
           ctx->pending_frame_magnitude = 0;
+#endif
         } else {
           pkt.data.frame.buf = cx_data;
           pkt.data.frame.sz  = size;

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index e633691..ed5f4ca 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -241,11 +241,15 @@
 }
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-# the transform coefficients are held in 32-bit
-# values, so the assembler code for  vp9_block_error can no longer be used.
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error/;
 
+  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
+
+  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
+
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp/;
 
@@ -320,9 +324,6 @@
 
   # ENCODEMB INVOKE
 
-  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error sse2/;
-
   add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp/;
 

diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 4cac388..678e312 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c

@@ -195,8 +195,8 @@
                                                          int mi_row,
                                                          int mi_col,
                                                          PICK_MODE_CONTEXT *ctx,
-                                                         int *motion_magnitude
-                                                         ) {
+                                                         int *motion_magnitude,
+                                                         int is_skin) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
@@ -214,6 +214,9 @@
 
   saved_mbmi = *mbmi;
 
+  if (is_skin && *motion_magnitude > 16)
+    return COPY_BLOCK;
+
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
@@ -313,18 +316,37 @@
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
   int motion_magnitude = 0;
-  VP9_DENOISER_DECISION decision = FILTER_BLOCK;
+  VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
   uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
   uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
                                           mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
+  int is_skin = 0;
+
+  if (bs <= BLOCK_16X16) {
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const int stride = mb->plane[0].src.stride;
+    const int strideuv = mb->plane[1].src.stride;
+    const uint8_t ysource =
+      mb->plane[0].src.buf[y_height_shift * stride + y_width_shift];
+    const uint8_t usource =
+      mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource =
+      mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+  }
 
   decision = perform_motion_compensation(denoiser, mb, bs,
                                          denoiser->increase_denoising,
                                          mi_row, mi_col, ctx,
-                                         &motion_magnitude);
+                                         &motion_magnitude,
+                                         is_skin);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,

diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index c66fdf4..ec0b25e 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h

@@ -12,6 +12,7 @@
 #define VP9_ENCODER_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 #include "vpx_scale/yv12config.h"
 
 #ifdef __cplusplus

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1818906..1944291 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -269,6 +269,71 @@
   *out_dist_sum = dist_sum << 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff,
+                                 intptr_t block_size,
+                                 int64_t *ssz, int bd) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  int shift = 2 * (bd - 8);  // 0 when bd == 8
+  int rounding = shift > 0 ? 1 << (shift - 1) : 0;  // half of (1 << shift)
+  // Accumulate the raw sums in 64 bits; scaling is applied after the loop.
+  for (i = 0; i < block_size; i++) {
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;  // add half, then shift down
+  sqcoeff = (sqcoeff + rounding) >> shift;
+  // *ssz receives the scaled sum of coeff^2; the scaled error is returned.
+  *ssz = sqcoeff;
+  return error;
+}
+
+int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
+                                      const tran_low_t *dqcoeff,
+                                      intptr_t block_size,
+                                      int64_t *ssz) {
+  int i;
+  int32_t c, d;
+  int64_t error = 0, sqcoeff = 0;
+  int16_t diff;
+  // Clamp bounds used for the 16-bit saturation of each coefficient below.
+  const int32_t hi = 0x00007fff;
+  const int32_t lo = 0xffff8000;  // presumably -32768 after conversion
+  // Accumulate squared error and squared (clamped) source coefficients.
+  for (i = 0; i < block_size; i++) {
+    c = coeff[i];
+    d = dqcoeff[i];
+
+    // Saturate both inputs to the [lo, hi] range
+    c = (c > hi) ? hi : ((c < lo) ? lo : c);
+    d = (d > hi) ? hi : ((d < lo) ? lo : d);
+    // NOTE(review): d - c can exceed int16_t (e.g. hi - lo); confirm intent.
+    diff = d - c;
+    error +=  diff * diff;
+    sqcoeff += c * c;
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  // Outputs: *ssz = sum of c * c; return value = sum of diff * diff.
+  *ssz = sqcoeff;
+  return error;
+}
+
+static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
+                                               const tran_low_t *dqcoeff,
+                                               intptr_t block_size,
+                                               int64_t *ssz, int bd) {
+  // Depths other than 8 go to the generic routine, which also receives bd.
+  if (bd != 8)
+    return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
+
+  return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -297,30 +362,6 @@
   return error;
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
-                                 const tran_low_t *dqcoeff,
-                                 intptr_t block_size,
-                                 int64_t *ssz, int bd) {
-  int i;
-  int64_t error = 0, sqcoeff = 0;
-  int shift = 2 * (bd - 8);
-  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
-
-  for (i = 0; i < block_size; i++) {
-    const int64_t diff = coeff[i] - dqcoeff[i];
-    error +=  diff * diff;
-    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
-  }
-  assert(error >= 0 && sqcoeff >= 0);
-  error = (error + rounding) >> shift;
-  sqcoeff = (sqcoeff + rounding) >> shift;
-
-  *ssz = sqcoeff;
-  return error;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
@@ -430,8 +471,9 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 #if CONFIG_VP9_HIGHBITDEPTH
   const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
-  *out_dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                     &this_sse, bd) >> shift;
+  *out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,
+                                              16 << ss_txfrm_size,
+                                              &this_sse, bd) >> shift;
 #else
   *out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                               &this_sse) >> shift;
@@ -831,7 +873,7 @@
             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                                  so->scan, so->neighbors,
                                  cpi->sf.use_fast_coef_costing);
-            distortion += vp9_highbd_block_error(
+            distortion += vp9_highbd_block_error_dispatch(
                 coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                 16, &unused, xd->bd) >> 2;
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -929,8 +971,13 @@
           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                              so->scan, so->neighbors,
                              cpi->sf.use_fast_coef_costing);
+#if CONFIG_VP9_HIGHBITDEPTH
+          distortion += vp9_highbd_block_error_8bit(
+              coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2;
+#else
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >> 2;
+#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1368,6 +1415,9 @@
   k = i;
   for (idy = 0; idy < height / 4; ++idy) {
     for (idx = 0; idx < width / 4; ++idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+#endif
       int64_t ssz, rd, rd1, rd2;
       tran_low_t* coeff;
 
@@ -1377,14 +1427,8 @@
                     coeff, 8);
       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        thisdistortion += vp9_highbd_block_error(coeff,
-                                                 BLOCK_OFFSET(pd->dqcoeff, k),
-                                                 16, &ssz, xd->bd);
-      } else {
-        thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
-                                          16, &ssz);
-      }
+      thisdistortion += vp9_highbd_block_error_dispatch(
+          coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
 #else
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);

diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index aaa8ea0..c2763b7 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c

@@ -98,12 +98,13 @@
       uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
       uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
       uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
+      int is_skin = 0;
       if (mode_filter == 1) {
         ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
         usource = (usource + usource2 + usource3 + usource4) >> 2;
         vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
       }
-      const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+      is_skin = vp9_skin_pixel(ysource, usource, vsource);
       for (i = 0; i < y_bsize; i++) {
         for (j = 0; j < y_bsize; j++) {
           if (is_skin)

diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 3d4e737..0a87ef9 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h

@@ -25,7 +25,8 @@
 
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
-void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
 #endif
 
 #ifdef __cplusplus

diff --git a/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
new file mode 100644
index 0000000..f3b8f01
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_error_sse2.asm

@@ -0,0 +1,98 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+;
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+;                                     intptr_t block_size, int64_t *ssz)
+;
+
+INIT_XMM sse2
+cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*4]     ; uqc += size 4-byte elements (end of coeff)
+  lea     dqcq, [dqcq+sizeq*4]     ; dqc += size 4-byte elements (end of dqcoeff)
+  neg    sizeq                     ; negative index that counts up towards zero
+
+  ALIGN 16
+
+.loop:
+  mova      m0, [dqcq+sizeq*4]           ; first vector of dqcoeff dwords
+  packssdw  m0, [dqcq+sizeq*4+mmsize]    ; + next vector, packed to signed words
+  mova      m2, [uqcq+sizeq*4]           ; first vector of coeff dwords
+  packssdw  m2, [uqcq+sizeq*4+mmsize]    ; + next vector, packed to signed words
+
+  mova      m1, [dqcq+sizeq*4+mmsize*2]  ; same again for the following two
+  packssdw  m1, [dqcq+sizeq*4+mmsize*3]  ; vectors of dqcoeff ...
+  mova      m3, [uqcq+sizeq*4+mmsize*2]  ; ... and of coeff
+  packssdw  m3, [uqcq+sizeq*4+mmsize*3]
+
+  add    sizeq, mmsize             ; 4*mmsize bytes per array were consumed;
+                                   ; ZF set here is what jnz tests below
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+  psubw     m0, m2                 ; m0 = dqcoeff - coeff (16-bit lanes)
+  pmaddwd   m2, m2                 ; m2 = coeff^2, adjacent pairs summed (ssz)
+  pmaddwd   m0, m0                 ; m0 = error^2, adjacent pairs summed (sse)
+
+  psubw     m1, m3                 ; same three steps for the second half
+  pmaddwd   m3, m3
+  pmaddwd   m1, m1
+
+  ; accumulate in 64bit
+  punpckldq m7, m0, m5             ; low dwords of m0 interleaved with zero (m5)
+  punpckhdq m0, m5                 ; high dwords of m0 widened in place
+  paddq     m4, m7
+
+  punpckldq m7, m2, m5             ; same widening for the ssz terms in m2
+  punpckhdq m2, m5
+  paddq     m6, m7
+
+  punpckldq m7, m1, m5             ; second-half sse terms
+  punpckhdq m1, m5
+  paddq     m4, m7
+
+  punpckldq m7, m3, m5             ; second-half ssz terms
+  punpckhdq m3, m5
+  paddq     m6, m7
+
+  paddq     m4, m0                 ; fold in the widened high halves
+  paddq     m4, m1
+  paddq     m6, m2
+  paddq     m6, m3
+
+  jnz .loop                        ; NOTE(review): flags come from "add sizeq"
+                                   ; above; assumes size is a nonzero multiple
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; m5.lo = m4.hi (m5 is no longer zero)
+  movhlps   m7, m6                 ; m7.lo = m6.hi
+  paddq     m4, m5                 ; low qword of m4 = total sse
+  paddq     m6, m7                 ; low qword of m6 = total ssz
+
+%if ARCH_X86_64
+  movq    rax, m4                  ; return value = sse
+  movq [sszq], m6                  ; *ssz = ssz total
+%else
+  mov     eax, sszm                ; ssz pointer (presumably the stack argument)
+  pshufd   m5, m4, 0x1             ; m5[31:0] = m4[63:32], upper half of sse
+  movq  [eax], m6                  ; *ssz = ssz total
+  movd    eax, m4                  ; low 32 bits of sse
+  movd    edx, m5                  ; high 32 bits of sse (result in edx:eax)
+%endif
+  RET

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 84b12d7..a2cbacf 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -100,8 +100,12 @@
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 endif
+endif
 
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)

diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
index 2f3cadd..ca21539 100644
--- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

@@ -69,11 +69,9 @@
 %if CONFIG_VP9_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  mova                            m6, [  coeffq+ncoeffq*4+16]
+  packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
-  mova                           m11, [  coeffq+ncoeffq*4+48]
-  packssdw                        m9, m6 ; m9 = c[i]
-  packssdw                       m10, m11 ; m10 = c[i]
+  packssdw                       m10, [  coeffq+ncoeffq*4+48]
 %else
   mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
@@ -171,11 +169,9 @@
 %if CONFIG_VP9_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  mova                            m6, [  coeffq+ncoeffq*4+16]
+  packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
-  mova                           m11, [  coeffq+ncoeffq*4+48]
-  packssdw                        m9, m6 ; m9 = c[i]
-  packssdw                       m10, m11 ; m10 = c[i]
+  packssdw                       m10, [  coeffq+ncoeffq*4+48]
 %else
   mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]