Merge "Resolve a couple of TODOs in firstpass.c"
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
new file mode 100644
index 0000000..dbe4342
--- /dev/null
+++ b/test/minmax_test.cc
@@ -0,0 +1,132 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride,
+                           const uint8_t *b, int b_stride,
+                           int *min, int *max);
+
+class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
+ public:
+  virtual void SetUp() {
+    mm_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  MinMaxFunc mm_func_;
+  ACMRandom rnd_;
+};
+
+void reference_minmax(const uint8_t *a, int a_stride,
+                      const uint8_t *b, int b_stride,
+                      int *min_ret, int *max_ret) {
+  int min = 255;
+  int max = 0;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(MinMaxTest, MinValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 255, sizeof(b));
+    b[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(255, max);
+    EXPECT_EQ(i, min);
+  }
+}
+
+TEST_P(MinMaxTest, MaxValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+    b[i] = i;  // Set a maximum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+}
+
+TEST_P(MinMaxTest, CompareReference) {
+  uint8_t a[64], b[64];
+  for (int j = 0; j < 64; j++) {
+    a[j] = rnd_.Rand8();
+    b[j] = rnd_.Rand8();
+  }
+
+  int min_ref, max_ref, min, max;
+  reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t a[8 * 64], b[8 * 64];
+  for (int i = 0; i < 8 * 64; i++) {
+    a[i] = rnd_.Rand8();
+    b[i] = rnd_.Rand8();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_neon));
+#endif
+
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index d28ab11..7c22ca5 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -144,6 +144,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 2d98e77..4614625 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1072,8 +1072,6 @@
       }
       break;
   }
-
-  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 8b70062..42d456e 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -21,12 +21,6 @@
 #include "vp9/encoder/vp9_denoiser.h"
 #include "vp9/encoder/vp9_encoder.h"
 
-/* The VP9 denoiser is similar to that of the VP8 denoiser. While
- * choosing the motion vectors / reference frames, the denoiser is run, and if
- * it did not modify the signal to much, the denoised block is copied to the
- * signal.
- */
-
 #ifdef OUTPUT_YUV_DENOISED
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
 #endif
@@ -49,16 +43,19 @@
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
                            int motion_magnitude) {
   if (motion_magnitude >
       noise_motion_thresh(bs, increase_denoising)) {
-    return 0;
+    if (increase_denoising)
+      return (1 << num_pels_log2_lookup[bs]) << 2;
+    else
+      return 0;
   } else {
-    return (1 << num_pels_log2_lookup[bs]) * 20;
+    return (1 << num_pels_log2_lookup[bs]) << 4;
   }
 }
 
@@ -222,6 +219,7 @@
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
+      ctx->newmv_sse != UINT_MAX &&
       sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
     mi->ref_frame[0] = ctx->best_reference_frame;
     mi->mode = ctx->best_sse_inter_mode;
@@ -496,12 +494,12 @@
   ctx->zeromv_sse = UINT_MAX;
   ctx->newmv_sse = UINT_MAX;
   ctx->zeromv_lastref_sse = UINT_MAX;
+  ctx->best_sse_mv.as_int = 0;
 }
 
 void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
                                      PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx) {
-  // TODO(tkopp): Use both MVs if possible
   if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
     ctx->zeromv_sse = sse;
     ctx->best_zeromv_reference_frame = mi->ref_frame[0];
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index a96b912..d4fee6f 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -128,6 +128,14 @@
       ne->last_h = cm->height;
     }
     return;
+  } else if (cpi->rc.avg_frame_low_motion < 50) {
+    // Force noise estimation to 0 and denoiser off if content has high motion.
+    ne->level = kLowLow;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0)
+      vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
+    return;
   } else {
     int num_samples = 0;
     uint64_t avg_est = 0;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 3bd1753..d53e60a 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -338,6 +338,7 @@
   rc->total_target_bits = 0;
   rc->total_target_vs_actual = 0;
   rc->avg_intersize_gfint = 0;
+  rc->avg_frame_low_motion = 0;
 
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
@@ -1334,6 +1335,26 @@
   }
 }
 
+static void compute_frame_low_motion(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_row, mi_col;
+  MODE_INFO **mi = cm->mi_grid_visible;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  int cnt_zeromv = 0;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      if (abs(mi[0]->mv[0].as_mv.row) < 16 &&
+          abs(mi[0]->mv[0].as_mv.col) < 16)
+        cnt_zeromv++;
+      mi++;
+    }
+    mi += 8;
+  }
+  cnt_zeromv = 100 * cnt_zeromv / (rows * cols);
+  rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2;
+}
+
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1420,10 +1441,6 @@
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (!cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
-    rc->avg_intersize_gfint += rc->projected_frame_size;
-  }
-
   if (!cpi->use_svc || is_two_pass_svc(cpi)) {
     if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
         (cm->frame_type != KEY_FRAME))
@@ -1447,6 +1464,13 @@
         rc->next_frame_size_selector != rc->frame_size_selector;
     rc->frame_size_selector = rc->next_frame_size_selector;
   }
+
+  if (oxcf->pass == 0) {
+    if (cm->frame_type != KEY_FRAME)
+      compute_frame_low_motion(cpi);
+    if (!cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame)
+      rc->avg_intersize_gfint += rc->projected_frame_size;
+  }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1507,6 +1531,7 @@
   if (rc->frames_till_gf_update_due == 0) {
     rc->avg_intersize_gfint =
         rc->avg_intersize_gfint / (rc->baseline_gf_interval + 1);
+    rc->gfu_boost = DEFAULT_GF_BOOST;
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
       vp9_cyclic_refresh_set_golden_update(cpi);
     } else {
@@ -1523,6 +1548,11 @@
           rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 &&
           rc->avg_intersize_gfint > (5 * rc->avg_frame_bandwidth) >> 1) {
           rc->baseline_gf_interval = (3 * rc->baseline_gf_interval) >> 1;
+      } else if (cm->current_video_frame > 30 &&
+                 rc->avg_frame_low_motion < 20) {
+        // Decrease boost and gf interval for high motion case.
+        rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+        rc->baseline_gf_interval = VPXMIN(6, rc->baseline_gf_interval >> 1);
       }
     }
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -1535,7 +1565,6 @@
     }
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
-    rc->gfu_boost = DEFAULT_GF_BOOST;
     rc->avg_intersize_gfint = 0;
   }
   if (cm->frame_type == KEY_FRAME)
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 7b6f165..eef1940 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -163,6 +163,7 @@
   int high_source_sad;
   int count_last_scene_change;
   int avg_intersize_gfint;
+  int avg_frame_low_motion;
 } RATE_CONTROL;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index ff0dfce..268aa91 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -112,7 +112,6 @@
 void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
   int i, j, mi_row, mi_col, num_bl;
   VP9_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   uint8_t *y;
   const uint8_t *src_y = cpi->Source->y_buffer;
   const uint8_t *src_u = cpi->Source->u_buffer;
@@ -166,19 +165,17 @@
       } else {
         int block_size = BLOCK_8X8;
         int consec_zeromv = 0;
-        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-          int bl_index = mi_row * cm->mi_cols + mi_col;
-          int bl_index1 = bl_index + 1;
-          int bl_index2 = bl_index + cm->mi_cols;
-          int bl_index3 = bl_index2 + 1;
-          if (y_bsize == 8)
-            consec_zeromv = cr->consec_zero_mv[bl_index];
-          else
-            consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
-                                     VPXMIN(cr->consec_zero_mv[bl_index1],
-                                     VPXMIN(cr->consec_zero_mv[bl_index2],
-                                     cr->consec_zero_mv[bl_index3])));
-        }
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        if (y_bsize == 8)
+          consec_zeromv = cpi->consec_zero_mv[bl_index];
+        else
+          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                 VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                 VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                 cpi->consec_zero_mv[bl_index3])));
         if (y_bsize == 16)
           block_size = BLOCK_16X16;
         is_skin  = vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index d054c41..e52958c 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -197,3 +197,60 @@
     return s - ((t * t) >> shift_factor);
   }
 }
+
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
+                         const uint8_t *b, int b_stride,
+                         int *min, int *max) {
+  // Load and concatenate.
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
+                                     vld1_u8(a + a_stride));
+  const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
+                                     vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
+                                     vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
+                                     vld1_u8(a + 7 * a_stride));
+
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
+                                     vld1_u8(b + b_stride));
+  const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
+                                     vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
+                                     vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
+                                     vld1_u8(b + 7 * b_stride));
+
+  // Absolute difference.
+  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+  // Max values between the Q vectors.
+  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+  // Split to D and start doing pairwise.
+  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propogate the max/min values to every position.
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u8((uint8_t *)max, ab_max, 0);
+  vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 2b13192..9ea80a0 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1014,7 +1014,7 @@
   specialize qw/vpx_avg_4x4 sse2 neon msa/;
 
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vpx_minmax_8x8 sse2/;
+  specialize qw/vpx_minmax_8x8 sse2 neon/;
 
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
   specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";