Merge "mips msa vp9 block error optimization"
diff --git a/build/make/Makefile b/build/make/Makefile
index b56b490..f1b1cca 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -22,8 +22,10 @@
 exampletest: .DEFAULT
 install:: .DEFAULT
 test:: .DEFAULT
+test-no-data-check:: .DEFAULT
 testdata:: .DEFAULT
 utiltest: .DEFAULT
+exampletest-no-data-check utiltest-no-data-check: .DEFAULT
 
 
 # Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
@@ -113,6 +115,9 @@
 testdata::
 .PHONY: utiltest
 utiltest:
+.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
+test-no-data-check::
+exampletest-no-data-check utiltest-no-data-check:
 
 # Add compiler flags for intrinsic files
 ifeq ($(TOOLCHAIN), x86-os2-gcc)
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c5bed61..688fa12 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -728,6 +728,13 @@
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
   case ${toolchain} in
+    arm*-darwin*)
+      ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${ios_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${ios_sdk_dir}"
+        add_ldflags "-isysroot ${ios_sdk_dir}"
+      fi
+      ;;
     *-darwin*)
       osx_sdk_dir="$(show_darwin_sdk_path macosx)"
       if [ -d "${osx_sdk_dir}" ]; then
@@ -803,7 +810,14 @@
           if disabled neon && enabled neon_asm; then
             die "Disabling neon while keeping neon-asm is not supported"
           fi
-          soft_enable media
+          case ${toolchain} in
+            *-darwin*)
+              # Neon is guaranteed on iOS 6+ devices, while old media extensions
+              # no longer assemble with iOS 9 SDK
+              ;;
+            *)
+              soft_enable media
+          esac
           ;;
         armv6)
           soft_enable media
diff --git a/libs.mk b/libs.mk
index 0ca8379..6215990 100644
--- a/libs.mk
+++ b/libs.mk
@@ -508,11 +508,13 @@
 
 define test_shard_template
 test:: test_shard.$(1)
-test_shard.$(1): $(LIBVPX_TEST_BIN) testdata
+test-no-data-check:: test_shard_ndc.$(1)
+test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
 	@set -e; \
 	 export GTEST_SHARD_INDEX=$(1); \
 	 export GTEST_TOTAL_SHARDS=$(2); \
 	 $(LIBVPX_TEST_BIN)
+test_shard.$(1): testdata
-.PHONY: test_shard.$(1)
+.PHONY: test_shard.$(1) test_shard_ndc.$(1)
 endef
 
@@ -557,15 +559,16 @@
 # TODO(tomfinegan): Support running the debug versions of tools?
 TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
-utiltest: testdata
+utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(TEST_BIN_PATH)
+utiltest: testdata
 else
-utiltest:
+utiltest utiltest-no-data-check:
 	@echo Unit tests must be enabled to make the utiltest target.
 endif
 
@@ -583,11 +586,12 @@
 # TODO(tomfinegan): Support running the debug versions of tools?
 EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
-exampletest: examples testdata
+exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(EXAMPLES_BIN_PATH)
+exampletest: testdata
 else
-exampletest:
+exampletest exampletest-no-data-check:
 	@echo Unit tests must be enabled to make the exampletest target.
 endif
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 7b4c435..46d4a25 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -316,8 +316,8 @@
                 vp9_dc_left_predictor_16x16_neon,
                 vp9_dc_top_predictor_16x16_neon,
                 vp9_dc_128_predictor_16x16_neon, vp9_v_predictor_16x16_neon,
-                vp9_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                vp9_tm_predictor_16x16_neon)
+                vp9_h_predictor_16x16_neon, vp9_d45_predictor_16x16_neon, NULL,
+                NULL, NULL, NULL, NULL, vp9_tm_predictor_16x16_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 60424ed..0bdcc08 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -409,6 +409,7 @@
 YUV_RAW_INPUT_HEIGHT=288
 
 Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
 
 # Setup a trap function to clean up after tests complete.
 trap cleanup EXIT
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index 1faa145..bf551a8 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -60,6 +60,10 @@
   echo ""${Y4M_NOSQ_PAR_INPUT}""
 }
 
+y4m_input_720p() {
+  echo "${Y4M_720P_INPUT}"  # Quoted: the path is emitted without splitting.
+}
+
 # Echo default vpxenc real time encoding params. $1 is the codec, which defaults
 # to vp8 if unspecified.
 vpxenc_rt_params() {
@@ -68,7 +72,7 @@
     --buf-initial-sz=500
     --buf-optimal-sz=600
     --buf-sz=1000
-    --cpu-used=-5
+    --cpu-used=-6
     --end-usage=cbr
     --error-resilient=1
     --kf-max-dist=90000
@@ -258,6 +262,34 @@
   fi
 }
 
+# Real-time VP9 WebM encode for each threads/tile-columns pair in 2..4.
+vpxenc_vp9_webm_rt_multithread_tiled() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --output="${output}"
+        # Check and remove per run so a stale file cannot mask a failure.
+        if [ ! -e "${output}" ]; then
+          elog "Output file does not exist."
+          return 1
+        fi
+        rm "${output}"
+      done
+    done
+  fi
+}
+
 vpxenc_vp9_webm_2pass() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
@@ -357,6 +389,7 @@
               vpxenc_vp9_ivf
               vpxenc_vp9_webm
               vpxenc_vp9_webm_rt
+              vpxenc_vp9_webm_rt_multithread_tiled
               vpxenc_vp9_webm_2pass
               vpxenc_vp9_ivf_lossless
               vpxenc_vp9_ivf_minq0_maxq0
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
index 974d3b6..3c8ed11 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -12,7 +12,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
-static const uint16_t bilinear_taps_coeff[8][2] = {
+static const uint8_t bilinear_taps_coeff[8][2] = {
     {128,   0},
     {112,  16},
     { 96,  32},
@@ -972,9 +972,9 @@
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
-                                      const uint16_t *vpx_filter) {
-  const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+                                      const uint8_t *vpx_filter) {
+  const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]);
   unsigned int i;
   for (i = 0; i < output_height; ++i) {
     const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index cfd5905..92706bf 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -358,6 +358,23 @@
   vst1_u8(dst + i * stride, row);
 }
 
+void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t A0 = vld1q_u8(above);  // top row
+  const uint8x16_t above_right = vld1q_dup_u8(above + 15);  // splat above[15]
+  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);  // A0 shifted 1 lane
+  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);  // A0 shifted 2 lanes
+  const uint8x16_t avg1 = vhaddq_u8(A0, A2);  // (A0 + A2) >> 1
+  uint8x16_t row = vrhaddq_u8(avg1, A1);  // (avg1 + A1 + 1) >> 1
+  int i;
+  (void)left;  // unused
+  for (i = 0; i < 15; ++i) {
+    vst1q_u8(dst + i * stride, row);
+    row = vextq_u8(row, above_right, 1);  // shift in above[15]
+  }
+  vst1q_u8(dst + i * stride, row);  // i == 15: last row
+}
+
 // -----------------------------------------------------------------------------
 
 void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 0de072a..604c03e 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -138,7 +138,7 @@
 specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_16x16 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 0e4d863..6270bf4 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -51,7 +51,7 @@
   // Rate target ratio to set q delta.
   double rate_ratio_qdelta;
   // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
-  double rate_boost_fac;
+  int rate_boost_fac;
   double low_content_avg;
   int qindex_delta[3];
 };
@@ -129,7 +129,8 @@
   else  if (bsize >= BLOCK_16X16 &&
             rate < cr->thresh_rate_sb &&
             is_inter_block(mbmi) &&
-            mbmi->mv[0].as_int == 0)
+            mbmi->mv[0].as_int == 0 &&
+            cr->rate_boost_fac > 10)
     // More aggressive delta-q for bigger blocks with zero motion.
     return CR_SEGMENT_ID_BOOST2;
   else
@@ -464,10 +465,10 @@
       cm->height <= 288 &&
       rc->avg_frame_bandwidth < 3400) {
     cr->motion_thresh = 4;
-    cr->rate_boost_fac = 1.25;
+    cr->rate_boost_fac = 10;
   } else {
     cr->motion_thresh = 32;
-    cr->rate_boost_fac = 1.7;
+    cr->rate_boost_fac = 17;
   }
 }
 
@@ -541,9 +542,9 @@
     vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
 
     // Set a more aggressive (higher) q delta for segment BOOST2.
-    qindex_delta = compute_deltaq(cpi, cm->base_qindex,
-                                  MIN(CR_MAX_RATE_TARGET_RATIO,
-                                  cr->rate_boost_fac * cr->rate_ratio_qdelta));
+    qindex_delta = compute_deltaq(
+        cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO,
+        0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
     cr->qindex_delta[2] = qindex_delta;
     vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 425073f..85003f6 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1596,7 +1596,10 @@
     target = calc_pframe_target_size_one_pass_cbr(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
-  cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+    cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
+  else
+    cpi->resize_state = 0;
 }
 
 int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1781,7 +1784,7 @@
       ++cpi->resize_buffer_underflow;
     ++cpi->resize_count;
     // Check for resize action every "window" frames.
-    if (cpi->resize_count == window) {
+    if (cpi->resize_count >= window) {
       int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
       // Resize down if buffer level has underflowed sufficent amount in past
       // window, and we are at original resolution.