Merge "Optimization of 8bit block error for high bitdepth"

diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 6619b00..ad02c95 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c

@@ -170,8 +170,12 @@
 static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
+#if CONFIG_MISC_FIXES
+    vp10_diff_update_prob(r, &p[i]);
+#else
     if (vpx_read(r, MV_UPDATE_PROB))
       p[i] = (vpx_read_literal(r, 7) << 1) | 1;
+#endif
 }
 
 static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {

diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index 23851af..03a81f5 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c

@@ -459,6 +459,9 @@
   // an invalid bitstream and need to return an error.
 
   uint8_t marker;
+#if CONFIG_MISC_FIXES
+  size_t frame_sz_sum = 0;
+#endif
 
   assert(data_sz);
   marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
@@ -467,7 +470,7 @@
   if ((marker & 0xe0) == 0xc0) {
     const uint32_t frames = (marker & 0x7) + 1;
     const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * frames;
+    const size_t index_sz = 2 + mag * (frames - CONFIG_MISC_FIXES);
 
     // This chunk is marked as having a superframe index but doesn't have
     // enough data for it, thus it's an invalid superframe index.
@@ -498,13 +501,19 @@
         x = clear_buffer;
       }
 
-      for (i = 0; i < frames; ++i) {
+      for (i = 0; i < frames - CONFIG_MISC_FIXES; ++i) {
         uint32_t this_sz = 0;
 
         for (j = 0; j < mag; ++j)
           this_sz |= (*x++) << (j * 8);
         sizes[i] = this_sz;
+#if CONFIG_MISC_FIXES
+        frame_sz_sum += this_sz;
+#endif
       }
+#if CONFIG_MISC_FIXES
+      sizes[i] = data_sz - index_sz - frame_sz_sum;
+#endif
       *count = frames;
     }
   }

diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index 2902ece..d39e3dc 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c

@@ -163,26 +163,33 @@
         case CATEGORY5_TOKEN:
           val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
           break;
-        case CATEGORY6_TOKEN:
+        case CATEGORY6_TOKEN: {
+#if CONFIG_MISC_FIXES
+          const int skip_bits = TX_SIZES - 1 - tx_size;
+#else
+          const int skip_bits = 0;
+#endif
+          const uint8_t *cat6p = cat6_prob + skip_bits;
 #if CONFIG_VP9_HIGHBITDEPTH
           switch (xd->bd) {
             case VPX_BITS_8:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
               break;
             case VPX_BITS_10:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 16, r);
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
               break;
             case VPX_BITS_12:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 18, r);
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
               break;
             default:
               assert(0);
               return -1;
           }
 #else
-          val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+          val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
 #endif
           break;
+        }
       }
     }
     v = (val * dqv) >> dq_shift;

diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 940ae88..1661fbd 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c

@@ -122,8 +122,11 @@
 
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
-                           vpx_bit_depth_t bit_depth) {
+                           vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
   TOKENEXTRA *p = *tp;
+#if !CONFIG_MISC_FIXES
+  (void) tx;
+#endif
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
@@ -171,6 +174,12 @@
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
+#if CONFIG_MISC_FIXES
+      int skip_bits =
+          (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+#else
+      int skip_bits = 0;
+#endif
 
       if (l) {
         const unsigned char *pb = b->prob;
@@ -180,7 +189,12 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          vpx_write(w, bb, pb[i >> 1]);
+          if (skip_bits) {
+            skip_bits--;
+            assert(!bb);
+          } else {
+            vpx_write(w, bb, pb[i >> 1]);
+          }
           i = b->tree[i + bb];
         } while (n);
       }
@@ -190,7 +204,7 @@
     ++p;
   }
 
-  *tp = p + (p->token == EOSB_TOKEN);
+  *tp = p;
 }
 
 static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
@@ -382,6 +396,7 @@
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
+  int plane;
 
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   m = xd->mi[0];
@@ -398,8 +413,16 @@
     pack_inter_mode_mvs(cpi, m, w);
   }
 
-  assert(*tok < tok_end);
-  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
+  if (!m->mbmi.skip) {
+    assert(*tok < tok_end);
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+                         : m->mbmi.tx_size;
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+    }
+  }
 }
 
 static void write_partition(const VP10_COMMON *const cm,

diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 019e5b1..ce1530c 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c

@@ -1356,9 +1356,6 @@
 
   if (output_enabled) {
     update_stats(&cpi->common, td);
-
-    (*tp)->token = EOSB_TOKEN;
-    (*tp)++;
   }
 }
 

diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index ca2de1f..0736c65 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c

@@ -15,6 +15,7 @@
 
 #include "vp10/encoder/cost.h"
 #include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/subexp.h"
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
@@ -134,8 +135,12 @@
   }
 }
 
-static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
-                     vpx_prob upd_p) {
+static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                      vpx_prob upd_p) {
+#if CONFIG_MISC_FIXES
+  (void) upd_p;
+  vp10_cond_prob_diff_update(w, cur_p, ct);
+#else
   const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
   const int update = cost_branch256(ct, *cur_p) + vp10_cost_zero(upd_p) >
                      cost_branch256(ct, new_p) + vp10_cost_one(upd_p) + 7 * 256;
@@ -144,7 +149,7 @@
     *cur_p = new_p;
     vpx_write_literal(w, new_p >> 1, 7);
   }
-  return update;
+#endif
 }
 
 static void write_mv_update(const vpx_tree_index *tree,

diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index af915fe..cbebd5a 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c

@@ -628,8 +628,16 @@
   }
 
   if (!dry_run) {
+    int plane;
+
     td->counts->skip[ctx][0] += skip_inc;
-    vp10_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+                                              &arg);
+      (*t)->token = EOSB_TOKEN;
+      (*t)++;
+    }
   } else {
     vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
   }

diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index 304f74e..409ed1c 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c

@@ -91,7 +91,9 @@
   size_t                  pending_cx_data_sz;
   int                     pending_frame_count;
   size_t                  pending_frame_sizes[8];
+#if !CONFIG_MISC_FIXES
   size_t                  pending_frame_magnitude;
+#endif
   vpx_image_t             preview_img;
   vpx_enc_frame_flags_t   next_frame_flags;
   vp8_postproc_cfg_t      preview_ppcfg;
@@ -781,24 +783,39 @@
   uint8_t marker = 0xc0;
   unsigned int mask;
   int mag, index_sz;
+#if CONFIG_MISC_FIXES
+  int i;
+  size_t max_frame_sz = 0;
+#endif
 
   assert(ctx->pending_frame_count);
   assert(ctx->pending_frame_count <= 8);
 
   // Add the number of frames to the marker byte
   marker |= ctx->pending_frame_count - 1;
+#if CONFIG_MISC_FIXES
+  for (i = 0; i < ctx->pending_frame_count - 1; i++) {
+    const size_t frame_sz = (unsigned int) ctx->pending_frame_sizes[i];
+    max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
+  }
+#endif
 
   // Choose the magnitude
   for (mag = 0, mask = 0xff; mag < 4; mag++) {
+#if CONFIG_MISC_FIXES
+    if (max_frame_sz <= mask)
+      break;
+#else
     if (ctx->pending_frame_magnitude < mask)
       break;
+#endif
     mask <<= 8;
     mask |= 0xff;
   }
   marker |= mag << 3;
 
   // Write the index
-  index_sz = 2 + (mag + 1) * ctx->pending_frame_count;
+  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - CONFIG_MISC_FIXES);
   if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
     uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
     int i, j;
@@ -818,7 +835,7 @@
 #endif
 
     *x++ = marker;
-    for (i = 0; i < ctx->pending_frame_count; i++) {
+    for (i = 0; i < ctx->pending_frame_count - CONFIG_MISC_FIXES; i++) {
       unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
 
       for (j = 0; j <= mag; j++) {
@@ -974,7 +991,9 @@
             ctx->pending_cx_data = cx_data;
           ctx->pending_cx_data_sz += size;
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
           ctx->pending_frame_magnitude |= size;
+#endif
           cx_data += size;
           cx_data_sz -= size;
 
@@ -991,7 +1010,9 @@
             ctx->pending_cx_data = NULL;
             ctx->pending_cx_data_sz = 0;
             ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
             ctx->pending_frame_magnitude = 0;
+#endif
             ctx->output_cx_pkt_cb.output_cx_pkt(
                 &pkt, ctx->output_cx_pkt_cb.user_priv);
           }
@@ -1008,7 +1029,9 @@
 
         if (ctx->pending_cx_data) {
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
           ctx->pending_frame_magnitude |= size;
+#endif
           ctx->pending_cx_data_sz += size;
           // write the superframe only for the case when
           if (!ctx->output_cx_pkt_cb.output_cx_pkt)
@@ -1018,7 +1041,9 @@
           ctx->pending_cx_data = NULL;
           ctx->pending_cx_data_sz = 0;
           ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
           ctx->pending_frame_magnitude = 0;
+#endif
         } else {
           pkt.data.frame.buf = cx_data;
           pkt.data.frame.sz  = size;

diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 4cac388..678e312 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c

@@ -195,8 +195,8 @@
                                                          int mi_row,
                                                          int mi_col,
                                                          PICK_MODE_CONTEXT *ctx,
-                                                         int *motion_magnitude
-                                                         ) {
+                                                         int *motion_magnitude,
+                                                         int is_skin) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
@@ -214,6 +214,9 @@
 
   saved_mbmi = *mbmi;
 
+  if (is_skin && *motion_magnitude > 16)
+    return COPY_BLOCK;
+
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
@@ -313,18 +316,37 @@
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
   int motion_magnitude = 0;
-  VP9_DENOISER_DECISION decision = FILTER_BLOCK;
+  VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
   uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
   uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
                                           mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
+  int is_skin = 0;
+
+  if (bs <= BLOCK_16X16) {
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const int stride = mb->plane[0].src.stride;
+    const int strideuv = mb->plane[1].src.stride;
+    const uint8_t ysource =
+      mb->plane[0].src.buf[y_height_shift * stride + y_width_shift];
+    const uint8_t usource =
+      mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource =
+      mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+  }
 
   decision = perform_motion_compensation(denoiser, mb, bs,
                                          denoiser->increase_denoising,
                                          mi_row, mi_col, ctx,
-                                         &motion_magnitude);
+                                         &motion_magnitude,
+                                         is_skin);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,

diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index c66fdf4..ec0b25e 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h

@@ -12,6 +12,7 @@
 #define VP9_ENCODER_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 #include "vpx_scale/yv12config.h"
 
 #ifdef __cplusplus

diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index aaa8ea0..c2763b7 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c

@@ -98,12 +98,13 @@
       uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
       uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
       uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
+      int is_skin = 0;
       if (mode_filter == 1) {
         ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
         usource = (usource + usource2 + usource3 + usource4) >> 2;
         vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
       }
-      const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+      is_skin = vp9_skin_pixel(ysource, usource, vsource);
       for (i = 0; i < y_bsize; i++) {
         for (j = 0; j < y_bsize; j++) {
           if (is_skin)

diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 3d4e737..0a87ef9 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h

@@ -25,7 +25,8 @@
 
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
-void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
 #endif
 
 #ifdef __cplusplus