Merge "vp9_ethread: add parallel loopfilter"
diff --git a/configure b/configure
index 4509697..f5dd7fc 100755
--- a/configure
+++ b/configure
@@ -58,6 +58,7 @@
   ${toggle_postproc_visualizer}   macro block / block level visualizers
   ${toggle_multi_res_encoding}    enable multiple-resolution encoding
   ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser
+  ${toggle_vp9_highbitdepth}      enable 10/12 bit support in VP9
   ${toggle_vp9_temporal_denoising}
                                   enable vp9 temporal denoising
   ${toggle_webm_io}               enable input from and output to WebM container
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
index b9da8c9..b1ad837 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -50,9 +50,10 @@
   return vget_lane_s32(c, 0);
 }
 
+// w * h must be less than 2048 or local variable v_sum may overflow.
 static void variance_neon_w8(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
-                             int w, int h, unsigned int *sse, int *sum) {
+                             int w, int h, uint32_t *sse, int *sum) {
   int i, j;
   int16x8_t v_sum = vdupq_n_s16(0);
   int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -215,25 +216,56 @@
 unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight64, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 11);  // >> 11 = / 32 * 64
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight32 * a_stride), a_stride,
+                   b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
 }
 
 unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 11);  // >> 11 = / 64 * 32
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+                   b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 64 * 32
 }
 
 unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight64, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 12);  // >> 12 = / 64 * 64
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1);
+  variance_neon_w8(a + (kHeight16 * a_stride), a_stride,
+                   b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16,
+                   &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride,
+                   b + (kHeight16 * 2 * b_stride), b_stride,
+                   kWidth64, kHeight16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride,
+                   b + (kHeight16 * 3 * b_stride), b_stride,
+                   kWidth64, kHeight16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
 }
 
 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index ab85337..a7aaff0 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -352,6 +352,7 @@
   int r;
   const uint8_t *srcbuf = src.y_buffer;
   uint8_t *destbuf = dest.y_buffer;
+
   assert(dest.y_width == src.y_width);
   assert(dest.y_height == src.y_height);
 
@@ -362,13 +363,13 @@
   }
 }
 
-static void swap_frame_buffer(YV12_BUFFER_CONFIG dest,
-                              YV12_BUFFER_CONFIG src) {
-  uint8_t *tmp_buf = dest.y_buffer;
-  assert(dest.y_width == src.y_width);
-  assert(dest.y_height == src.y_height);
-  dest.y_buffer = src.y_buffer;
-  src.y_buffer = tmp_buf;
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+                              YV12_BUFFER_CONFIG *src) {
+  uint8_t *tmp_buf = dest->y_buffer;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  dest->y_buffer = src->y_buffer;
+  src->y_buffer = tmp_buf;
 }
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
@@ -387,16 +388,16 @@
 
   /* For non key frames */
   if (refresh_alt_ref_frame) {
-    swap_frame_buffer(denoiser->running_avg_y[ALTREF_FRAME],
-                      denoiser->running_avg_y[INTRA_FRAME]);
+    swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
   if (refresh_golden_frame) {
-    swap_frame_buffer(denoiser->running_avg_y[GOLDEN_FRAME],
-                      denoiser->running_avg_y[INTRA_FRAME]);
+    swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
   if (refresh_last_frame) {
-    swap_frame_buffer(denoiser->running_avg_y[LAST_FRAME],
-                      denoiser->running_avg_y[INTRA_FRAME]);
+    swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
 }
 
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index ef6174e..82bce37 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -344,7 +344,8 @@
   buf = vp9_lookahead_peek(cpi->lookahead, 0);
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
       cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
-      lc->rc.frames_to_key != 0 && !(buf->flags & VPX_EFLAG_FORCE_KF)) {
+      lc->rc.frames_to_key != 0 &&
+      !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
     if ((cpi->svc.number_temporal_layers > 1 &&
          cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
         (cpi->svc.number_spatial_layers > 1 &&