Merge "configure.sh: avoid test -a/-o for compatibility"

diff --git a/libs.mk b/libs.mk
index 1e01639..25fbc2c 100644
--- a/libs.mk
+++ b/libs.mk

@@ -409,12 +409,16 @@
             curl -L -o $@ $(call libvpx_test_data_url,$(@F))
 
 testdata:: $(LIBVPX_TEST_DATA)
-	$(qexec)if [ -x "$$(which sha1sum)" ]; then\
+	$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
+          [ -x "$$(which shasum)" ] && sha1sum=shasum;\
+          [ -x "$$(which sha1)" ] && sha1sum=sha1;\
+          if [ -n "$${sha1sum}" ]; then\
+            set -e;\
             echo "Checking test data:";\
             if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
                 for f in $(call enabled,LIBVPX_TEST_DATA); do\
                     grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-                        (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\
+                        (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
                 done; \
             fi; \
         else\

diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh
index 6cb7d0e..854b74f 100755
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh

@@ -44,8 +44,8 @@
 
   [ -e "${output_file}" ] || return 1
 
-  local md5_last_frame=$(tail -n1 "${output_file}")
-  local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:])
+  local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')"
+  local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')"
   [ "${actual_md5}" = "${expected_md5}" ] || return 1
 }
 

diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 98ac0e6..ee6289f 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1

@@ -679,4 +679,5 @@
 9a70e8b7d14fba9234d0e51dce876635413ce444  thaloundeskmtg_640_480_30.yuv
 e7d315dbf4f3928779e0dc624311196d44491d32  niklas_1280_720_30.yuv
 c77e4a26616add298a05dd5d12397be22c0e40c5  vp90-2-18-resize.ivf
-c77e4a26616add298a05dd5d12397be22c0e40c5  vp90-2-18-resize.ivf
+c12918cf0a716417fba2de35c3fc5ab90e52dfce  vp90-2-18-resize.ivf.md5
+717da707afcaa1f692ff1946f291054eb75a4f06  screendata.y4m

diff --git a/test/test.mk b/test/test.mk
index 53d4057..057e1e8 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -99,8 +99,8 @@
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
-LIBVPX_TEST_SRCS-yes                   += scale_border_test.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
 
 endif # VP8
 

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 9dc7c6a..83b7435 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc

@@ -756,6 +756,18 @@
     ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
                       make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
 #endif  // HAVE_AVX2
+#if HAVE_NEON
+const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9VarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_neon)));
+
+const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
+    vp9_sub_pixel_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9SubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon)));
+#endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
 
 }  // namespace vp9

diff --git a/test/scale_border_test.cc b/test/vpx_scale_test.cc
similarity index 66%
rename from test/scale_border_test.cc
rename to test/vpx_scale_test.cc
index cc9a69a..b3302d9 100644
--- a/test/scale_border_test.cc
+++ b/test/vpx_scale_test.cc

@@ -21,11 +21,12 @@
 namespace {
 
 typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
+typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf,
+                              YV12_BUFFER_CONFIG *dst_ybf);
 
-class ExtendBorderTest
-    : public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+class VpxScaleBase {
  public:
-  virtual ~ExtendBorderTest() {
+  virtual ~VpxScaleBase() {
     libvpx_test::ClearSystemState();
   }
 
@@ -35,7 +36,6 @@
     vpx_memset(&img_, 0, sizeof(img_));
     ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
                                              VP8BORDERINPIXELS));
-
     vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
     FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
               img_.y_stride);
@@ -47,31 +47,25 @@
     vpx_memset(&ref_img_, 0, sizeof(ref_img_));
     ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
                                              VP8BORDERINPIXELS));
-
     vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
-    FillPlane(ref_img_.y_buffer, ref_img_.y_crop_width, ref_img_.y_crop_height,
-              ref_img_.y_stride);
-    FillPlane(ref_img_.u_buffer,
-              ref_img_.uv_crop_width, ref_img_.uv_crop_height,
-              ref_img_.uv_stride);
-    FillPlane(ref_img_.v_buffer,
-              ref_img_.uv_crop_width, ref_img_.uv_crop_height,
-              ref_img_.uv_stride);
+
+    vpx_memset(&cpy_img_, 0, sizeof(cpy_img_));
+    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
+                                             VP8BORDERINPIXELS));
+    vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
+    ReferenceCopyFrame();
   }
 
   void DeallocImage() {
     vp8_yv12_de_alloc_frame_buffer(&img_);
     vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+    vp8_yv12_de_alloc_frame_buffer(&cpy_img_);
   }
 
- private:
+ protected:
   static const int kBufFiller = 123;
   static const int kBufMax = kBufFiller - 1;
 
-  virtual void SetUp() {
-    extend_fn_ = GetParam();
-  }
-
   static void FillPlane(uint8_t *buf, int width, int height, int stride) {
     for (int y = 0; y < height; ++y) {
       for (int x = 0; x < width; ++x) {
@@ -80,24 +74,6 @@
     }
   }
 
-  void ReferenceExtendBorder() {
-    ExtendPlane(ref_img_.y_buffer,
-                ref_img_.y_crop_width, ref_img_.y_crop_height,
-                ref_img_.y_width, ref_img_.y_height,
-                ref_img_.y_stride,
-                ref_img_.border);
-    ExtendPlane(ref_img_.u_buffer,
-                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
-                ref_img_.uv_width, ref_img_.uv_height,
-                ref_img_.uv_stride,
-                ref_img_.border / 2);
-    ExtendPlane(ref_img_.v_buffer,
-                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
-                ref_img_.uv_width, ref_img_.uv_height,
-                ref_img_.uv_stride,
-                ref_img_.border / 2);
-  }
-
   static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
                           int width, int height, int stride, int padding) {
     // Copy the outermost visible pixel to a distance of at least 'padding.'
@@ -136,17 +112,75 @@
     }
   }
 
+  void ReferenceExtendBorder() {
+    ExtendPlane(ref_img_.y_buffer,
+                ref_img_.y_crop_width, ref_img_.y_crop_height,
+                ref_img_.y_width, ref_img_.y_height,
+                ref_img_.y_stride,
+                ref_img_.border);
+    ExtendPlane(ref_img_.u_buffer,
+                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+                ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride,
+                ref_img_.border / 2);
+    ExtendPlane(ref_img_.v_buffer,
+                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+                ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride,
+                ref_img_.border / 2);
+  }
+
+  void ReferenceCopyFrame() {
+    // Copy img_ to ref_img_ and extend frame borders. This will be used for
+    // verifying extend_fn_ as well as copy_frame_fn_.
+    EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
+    for (int y = 0; y < img_.y_crop_height; ++y) {
+      for (int x = 0; x < img_.y_crop_width; ++x) {
+        ref_img_.y_buffer[x + y * ref_img_.y_stride] =
+            img_.y_buffer[x + y * img_.y_stride];
+      }
+    }
+
+    for (int y = 0; y < img_.uv_crop_height; ++y) {
+      for (int x = 0; x < img_.uv_crop_width; ++x) {
+        ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
+            img_.u_buffer[x + y * img_.uv_stride];
+        ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
+            img_.v_buffer[x + y * img_.uv_stride];
+      }
+    }
+
+    ReferenceExtendBorder();
+  }
+
+  void CompareImages(const YV12_BUFFER_CONFIG actual) {
+    EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
+    EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
+                        ref_img_.frame_size));
+  }
+
+  YV12_BUFFER_CONFIG img_;
+  YV12_BUFFER_CONFIG ref_img_;
+  YV12_BUFFER_CONFIG cpy_img_;
+  int width_;
+  int height_;
+};
+
+class ExtendBorderTest
+    : public VpxScaleBase,
+      public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+ public:
+  virtual ~ExtendBorderTest() {}
+
+ protected:
+  virtual void SetUp() {
+    extend_fn_ = GetParam();
+  }
+
   void ExtendBorder() {
     ASM_REGISTER_STATE_CHECK(extend_fn_(&img_));
   }
 
-  void CompareImages() {
-    EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
-    EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, img_.buffer_alloc,
-                        ref_img_.frame_size));
-  }
-
- protected:
   void RunTest() {
 #if ARCH_ARM
     // Some arm devices OOM when trying to allocate the largest buffers.
@@ -160,17 +194,13 @@
         ResetImage(kSizesToTest[w], kSizesToTest[h]);
         ExtendBorder();
         ReferenceExtendBorder();
-        CompareImages();
+        CompareImages(img_);
         DeallocImage();
       }
     }
   }
 
-  YV12_BUFFER_CONFIG img_;
-  YV12_BUFFER_CONFIG ref_img_;
   ExtendFrameBorderFunc extend_fn_;
-  int width_;
-  int height_;
 };
 
 TEST_P(ExtendBorderTest, ExtendBorder) {
@@ -179,4 +209,48 @@
 
 INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest,
                         ::testing::Values(vp8_yv12_extend_frame_borders_c));
+
+class CopyFrameTest
+    : public VpxScaleBase,
+      public ::testing::TestWithParam<CopyFrameFunc> {
+ public:
+  virtual ~CopyFrameTest() {}
+
+ protected:
+  virtual void SetUp() {
+    copy_frame_fn_ = GetParam();
+  }
+
+  void CopyFrame() {
+    ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_));
+  }
+
+  void RunTest() {
+#if ARCH_ARM
+    // Some arm devices OOM when trying to allocate the largest buffers.
+    static const int kNumSizesToTest = 6;
+#else
+    static const int kNumSizesToTest = 7;
+#endif
+    static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+    for (int h = 0; h < kNumSizesToTest; ++h) {
+      for (int w = 0; w < kNumSizesToTest; ++w) {
+        ResetImage(kSizesToTest[w], kSizesToTest[h]);
+        ReferenceCopyFrame();
+        CopyFrame();
+        CompareImages(cpy_img_);
+        DeallocImage();
+      }
+    }
+  }
+
+  CopyFrameFunc copy_frame_fn_;
+};
+
+TEST_P(CopyFrameTest, CopyFrame) {
+  ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
+                        ::testing::Values(vp8_yv12_copy_frame_c));
 }  // namespace

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index e81c05e..298f50f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -616,7 +616,7 @@
 
 #if CONFIG_TEMPORAL_DENOISING
         if (cpi->denoiser.aggressive_mode != 0 &&
-            Q < cpi->denoiser.denoise_pars.qp_thresh) {
+            Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
           // Under aggressive denoising mode, use segmentation to turn off loop
           // filter below some qp thresh. The loop filter is turned off for all
           // blocks that have been encoded as ZEROMV LAST x frames in a row,

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d587749..3b1ca16 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -420,7 +420,7 @@
 specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -435,7 +435,7 @@
 specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
 
 add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
+specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -483,7 +483,7 @@
 specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -715,7 +715,7 @@
 specialize qw/vp9_subtract_block/, "$sse2_x86inc";
 
 add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
new file mode 100644
index 0000000..2d5ec79
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c

@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+                          int skip_block, const int16_t *zbin_ptr,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                          int zbin_oq_value, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  int i;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)scan;
+
+  if (!skip_block) {
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+
+    const int16x8_t v_zero = vdupq_n_s16(0);
+    const int16x8_t v_one = vdupq_n_s16(1);
+    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+    // adjust for dc
+    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+
+    for (i = 0; i < count; i += 8) {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+      const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+      const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan =
+          vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+
+      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+      v_round = vmovq_n_s16(round_ptr[1]);
+      v_quant = vmovq_n_s16(quant_ptr[1]);
+      v_dequant = vmovq_n_s16(dequant_ptr[1]);
+    }
+    {
+      const int16x4_t v_eobmax_3210 =
+          vmax_s16(vget_low_s16(v_eobmax_76543210),
+                   vget_high_s16(v_eobmax_76543210));
+      const int64x1_t v_eobmax_xx32 =
+          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+      const int16x4_t v_eobmax_tmp =
+          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+      const int64x1_t v_eobmax_xxx3 =
+          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+      const int16x4_t v_eobmax_final =
+          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+    }
+  } else {
+    vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+    vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+    *eob_ptr = 0;
+  }
+}

diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
new file mode 100644
index 0000000..f687118
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c

@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+enum { kWidth16 = 16 };
+enum { kHeight16 = 16 };
+enum { kHeight16PlusOne = 17 };
+enum { kPixelStepOne = 1 };
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo = vmlal_s16(v_sse_lo,
+                           vget_low_s16(sv_diff),
+                           vget_low_s16(sv_diff));
+      v_sse_hi = vmlal_s16(v_sse_hi,
+                           vget_high_s16(sv_diff),
+                           vget_high_s16(sv_diff));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
+                          const uint8_t *ref_ptr, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
+                   kHeight16, sse, sum);
+}
+
+unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                       uint8_t *output_ptr,
+                                       unsigned int src_pixels_per_line,
+                                       int pixel_step,
+                                       unsigned int output_height,
+                                       unsigned int output_width,
+                                       const int16_t *vp9_filter) {
+  const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
+  const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; ++i) {
+    const uint8x16_t src_0 = vld1q_u8(&src_ptr[0]);
+    const uint8x16_t src_1 = vld1q_u8(&src_ptr[pixel_step]);
+    const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+    const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+    const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+    const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+    const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+    const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+    vst1q_u8(&output_ptr[0], vcombine_u8(out_lo, out_hi));
+    // Next row...
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16);
+  DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+                             kHeight16PlusOne, kWidth16,
+                             BILINEAR_FILTERS_2TAP(xoffset));
+  var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
+                             kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
+  return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+}

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 584bcb8..74eaae5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1836,7 +1836,7 @@
   BLOCK_SIZE max_size = BLOCK_8X8;
   int bsl = mi_width_log2(BLOCK_64X64);
   const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
-                                     get_chessboard_index(cm)) % 2;
+                       get_chessboard_index(cm->current_video_frame)) & 0x1;
   // Trap case where we do not have a prediction.
   if (search_range_ctrl &&
       (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
@@ -1880,6 +1880,60 @@
   *max_block_size = max_size;
 }
 
+// TODO(jingning) refactor functions setting partition search range
+static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+  int mi_width  = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int idx, idy;
+
+  MODE_INFO *mi;
+  MODE_INFO **prev_mi =
+      &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+  BLOCK_SIZE bs, min_size, max_size;
+
+  min_size = BLOCK_64X64;
+  max_size = BLOCK_4X4;
+
+  if (prev_mi) {
+    for (idy = 0; idy < mi_height; ++idy) {
+      for (idx = 0; idx < mi_width; ++idx) {
+        mi = prev_mi[idy * cm->mi_stride + idx];
+        bs = mi ? mi->mbmi.sb_type : bsize;
+        min_size = MIN(min_size, bs);
+        max_size = MAX(max_size, bs);
+      }
+    }
+  }
+
+  if (xd->left_available) {
+    for (idy = 0; idy < mi_height; ++idy) {
+      mi = xd->mi[idy * cm->mi_stride - 1];
+      bs = mi ? mi->mbmi.sb_type : bsize;
+      min_size = MIN(min_size, bs);
+      max_size = MAX(max_size, bs);
+    }
+  }
+
+  if (xd->up_available) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      mi = xd->mi[idx - cm->mi_stride];
+      bs = mi ? mi->mbmi.sb_type : bsize;
+      min_size = MIN(min_size, bs);
+      max_size = MAX(max_size, bs);
+    }
+  }
+
+  if (min_size == max_size) {
+    min_size = min_partition_size[min_size];
+    max_size = max_partition_size[max_size];
+  }
+
+  *min_bs = min_size;
+  *max_bs = max_size;
+}
+
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
 }
@@ -1888,6 +1942,15 @@
   vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
+#if CONFIG_FP_MB_STATS
+const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4};
+const int num_16x16_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4};
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] =
+  {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120};
+#endif
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -1917,6 +1980,9 @@
   const int xss = x->e_mbd.plane[1].subsampling_x;
   const int yss = x->e_mbd.plane[1].subsampling_y;
 
+  BLOCK_SIZE min_size = cpi->sf.min_partition_size;
+  BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+
   int partition_none_allowed = !force_horz_split && !force_vert_split;
   int partition_horz_allowed = !force_vert_split && yss <= xss &&
                                bsize >= BLOCK_8X8;
@@ -1931,18 +1997,24 @@
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
   }
+
+  if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+    int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3)
+        + get_chessboard_index(cm->current_video_frame)) & 0x1;
+
+    if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+      set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+  }
+
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (cpi->sf.auto_min_max_partition_size) {
-    partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
-                               bsize >= cpi->sf.min_partition_size);
-    partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
-                                bsize >  cpi->sf.min_partition_size) ||
+    partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+    partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) ||
                                 force_horz_split);
-    partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
-                                bsize >  cpi->sf.min_partition_size) ||
+    partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) ||
                                 force_vert_split);
-    do_split &= bsize > cpi->sf.min_partition_size;
+    do_split &= bsize > min_size;
   }
   if (cpi->sf.use_square_partition_only) {
     partition_horz_allowed &= force_horz_split;
@@ -1993,6 +2065,52 @@
           do_split = 0;
           do_rect = 0;
         }
+
+#if CONFIG_FP_MB_STATS
+        // Check if every 16x16 first pass block statistics has zero
+        // motion and the corresponding first pass residue is small enough.
+        // If that is the case, check the difference variance between the
+        // current frame and the last frame. If the variance is small enough,
+        // stop further splitting in RD optimization
+        if (cpi->use_fp_mb_stats && do_split != 0 &&
+            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+          VP9_COMMON *cm = &cpi->common;
+          int mb_row = mi_row >> 1;
+          int mb_col = mi_col >> 1;
+          int mb_row_end =
+              MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+          int mb_col_end =
+              MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+          int r, c;
+
+          int skip = 1;
+          for (r = mb_row; r < mb_row_end; r++) {
+            for (c = mb_col; c < mb_col_end; c++) {
+              const int mb_index = r * cm->mb_cols + c;
+              if ((cpi->twopass.this_frame_mb_stats[mb_index] &
+                   FPMB_NONZERO_MOTION_MASK) ||
+                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
+                    FPMB_ERROR_LEVEL0_MASK)) {
+                skip = 0;
+                break;
+              }
+            }
+            if (skip == 0) {
+              break;
+            }
+          }
+          if (skip) {
+            unsigned int var;
+            set_offsets(cpi, tile, mi_row, mi_col, bsize);
+            var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+                                                 mi_row, mi_col, bsize);
+            if (var < 8) {
+              do_split = 0;
+              do_rect = 0;
+            }
+          }
+        }
+#endif
       }
     }
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2030,6 +2148,7 @@
         if (cpi->sf.adaptive_motion_search)
           load_pred_mv(x, ctx);
 
+        pc_tree->split[i]->index = i;
         rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
                           subsize, &this_rate, &this_dist, i != 3,
                           best_rd - sum_rd, pc_tree->split[i]);
@@ -2225,6 +2344,7 @@
     }
 
     vp9_zero(cpi->mb.pred_mv);
+    cpi->pc_root->index = 0;
 
     if ((sf->partition_search_type == SEARCH_PARTITION &&
          sf->use_lastframe_partitioning) ||

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 4b3f2ad..2a6c4b3 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -551,8 +551,8 @@
                                                          : 0];
 }
 
-static INLINE int get_chessboard_index(const VP9_COMMON *cm) {
-  return cm->current_video_frame % 2;
+static INLINE int get_chessboard_index(const int frame_index) {
+  return frame_index & 0x1;
 }
 
 #ifdef __cplusplus

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 30a0e9d..7a16001 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -394,7 +394,8 @@
   INTERP_FILTER filter_ref = cm->interp_filter;
   int bsl = mi_width_log2(bsize);
   const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
-      (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0;
+      (((mi_row + mi_col) >> bsl) +
+       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
   int const_motion[MAX_REF_FRAMES] = { 0 };
   int bh = num_4x4_blocks_high_lookup[bsize] << 2;
   int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
@@ -409,6 +410,10 @@
   PRED_BUFFER *this_mode_pred = NULL;
   int i;
 
+  // CTX is used by the temporal denoiser which is currently being developed.
+  // TODO(jbb): when temporal denoiser is finished and in the default build
+  // remove the following line;
+  (void) ctx;
   if (cpi->sf.reuse_inter_pred_sby) {
     for (i = 0; i < 3; i++) {
       tmp[i].data = &pred_buf[pixels_in_block * i];

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f65ac7b..c6580ee 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -2025,7 +2025,8 @@
 
   int bsl = mi_width_log2_lookup[bsize];
   int pred_filter_search = cpi->sf.cb_pred_filter_search ?
-      (((mi_row + mi_col) >> bsl)) & 0x01 : 0;
+      (((mi_row + mi_col) >> bsl) +
+       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
 
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7315dd4..6ca1bc5 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c

@@ -110,10 +110,12 @@
   if (speed >= 3) {
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
                                                         : USE_LARGESTALL;
-    if (MIN(cm->width, cm->height) >= 720)
+    if (MIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    else
+      sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
+    } else {
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+    }
 
     sf->adaptive_pred_interp_filter = 0;
     sf->cb_pred_filter_search = 1;
@@ -334,6 +336,7 @@
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
   sf->cb_pred_filter_search = 0;
+  sf->cb_partition_search = 0;
   sf->use_quant_fp = 0;
   sf->reference_masking = 0;
   sf->partition_search_type = SEARCH_PARTITION;

diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 929acaf..de731ce 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h

@@ -286,6 +286,8 @@
   // Chessboard pattern prediction filter type search
   int cb_pred_filter_search;
 
+  int cb_partition_search;
+
   // Fast quantization process path
   int use_quant_fp;
 

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b3dc0b1..197fdba 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -305,6 +305,16 @@
   return VPX_CODEC_OK;
 }
 
+static int get_image_bps(const vpx_image_t *img) {
+  switch (img->fmt) {
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_I420: return 12;
+    case VPX_IMG_FMT_I422: return 16;
+    case VPX_IMG_FMT_I444: return 24;
+    default: assert(0 && "Invalid image format");
+  }
+  return 0;
+}
 
 static vpx_codec_err_t set_encoder_config(
     VP9EncoderConfig *oxcf,
@@ -672,16 +682,6 @@
     priv->extra_cfg = extracfg_map[i].cfg;
     priv->extra_cfg.pkt_list = &priv->pkt_list.head;
 
-     // Maximum buffer size approximated based on having multiple ARF.
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
-
-    if (priv->cx_data_sz < 4096)
-      priv->cx_data_sz = 4096;
-
-    priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz);
-    if (priv->cx_data == NULL)
-      return VPX_CODEC_MEM_ERROR;
-
     vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -806,8 +806,24 @@
                                       unsigned long deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
 
-  if (img)
+  if (img != NULL) {
     res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition, encoder setup is done fully in init() currently.
+    if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+      // There's no codec control for multiple alt-refs so check the encoder
+      // instance for its status to determine the compressed data size.
+      ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
+                        get_image_bps(img) / 8 *
+                        (ctx->cpi->multi_arf_allowed ? 8 : 2);
+      if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
+
+      ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+      if (ctx->cx_data == NULL) {
+        return VPX_CODEC_MEM_ERROR;
+      }
+    }
+  }
 
   pick_quickcompress_mode(ctx, duration, deadline);
   vpx_codec_pkt_list_init(&ctx->pkt_list);

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 0ea28d3..c8e7c5e 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -131,5 +131,7 @@
 
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
deleted file mode 100644
index 696f47a..0000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
+++ /dev/null

@@ -1,233 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_yv12_copy_frame_func_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_scale_asm_offsets.asm
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
-;                                   YV12_BUFFER_CONFIG *dst_ybc);
-
-|vp8_yv12_copy_frame_func_neon| PROC
-    push            {r4 - r11, lr}
-    vpush           {d8 - d15}
-
-    sub             sp, sp, #16
-
-    ;Copy Y plane
-    ldr             r8, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
-    ldr             r9, [r1, #yv12_buffer_config_u_buffer]       ;srcptr1
-    ldr             r10, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
-    ldr             r11, [r1, #yv12_buffer_config_v_buffer]      ;srcptr1
-
-    ldr             r4, [r0, #yv12_buffer_config_y_height]
-    ldr             r5, [r0, #yv12_buffer_config_y_width]
-    ldr             r6, [r0, #yv12_buffer_config_y_stride]
-    ldr             r7, [r1, #yv12_buffer_config_y_stride]
-    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
-    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
-
-    str             r8, [sp]
-    str             r9, [sp, #4]
-    str             r10, [sp, #8]
-    str             r11, [sp, #12]
-
-    ; copy two rows at one time
-    mov             lr, r4, lsr #1
-
-cp_src_to_dst_height_loop
-    mov             r8, r2
-    mov             r9, r3
-    add             r10, r2, r6
-    add             r11, r3, r7
-    movs            r12, r5, lsr #7
-    ble             extra_cp_needed   ; y_width < 128
-
-cp_src_to_dst_width_loop
-    vld1.8          {q0, q1}, [r8]!
-    vld1.8          {q8, q9}, [r10]!
-    vld1.8          {q2, q3}, [r8]!
-    vld1.8          {q10, q11}, [r10]!
-    vld1.8          {q4, q5}, [r8]!
-    vld1.8          {q12, q13}, [r10]!
-    vld1.8          {q6, q7}, [r8]!
-    vld1.8          {q14, q15}, [r10]!
-
-    subs            r12, r12, #1
-
-    vst1.8          {q0, q1}, [r9]!
-    vst1.8          {q8, q9}, [r11]!
-    vst1.8          {q2, q3}, [r9]!
-    vst1.8          {q10, q11}, [r11]!
-    vst1.8          {q4, q5}, [r9]!
-    vst1.8          {q12, q13}, [r11]!
-    vst1.8          {q6, q7}, [r9]!
-    vst1.8          {q14, q15}, [r11]!
-
-    bne             cp_src_to_dst_width_loop
-
-    subs            lr, lr, #1
-    add             r2, r2, r6, lsl #1
-    add             r3, r3, r7, lsl #1
-
-    bne             cp_src_to_dst_height_loop
-
-extra_cp_needed
-    ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
-    sub             r11, r5, r10
-    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
-    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
-    bne             extra_cp_src_to_dst_width
-end_of_cp_src_to_dst
-
-;Copy U & V planes
-    ldr             r2, [sp]        ;srcptr1
-    ldr             r3, [sp, #4]        ;dstptr1
-    mov             r4, r4, lsr #1                  ;src uv_height
-    mov             r5, r5, lsr #1                  ;src uv_width
-    mov             r6, r6, lsr #1                  ;src uv_stride
-    mov             r7, r7, lsr #1                  ;dst uv_stride
-
-    mov             r1, #2
-
-cp_uv_loop
-
-    ;copy two rows at one time
-    mov             lr, r4, lsr #1
-
-cp_src_to_dst_height_uv_loop
-    mov             r8, r2
-    mov             r9, r3
-    add             r10, r2, r6
-    add             r11, r3, r7
-    movs            r12, r5, lsr #6
-    ble             extra_uv_cp_needed
-
-cp_src_to_dst_width_uv_loop
-    vld1.8          {q0, q1}, [r8]!
-    vld1.8          {q8, q9}, [r10]!
-    vld1.8          {q2, q3}, [r8]!
-    vld1.8          {q10, q11}, [r10]!
-
-    subs            r12, r12, #1
-
-    vst1.8          {q0, q1}, [r9]!
-    vst1.8          {q8, q9}, [r11]!
-    vst1.8          {q2, q3}, [r9]!
-    vst1.8          {q10, q11}, [r11]!
-
-    bne             cp_src_to_dst_width_uv_loop
-
-    subs            lr, lr, #1
-    add             r2, r2, r6, lsl #1
-    add             r3, r3, r7, lsl #1
-
-    bne             cp_src_to_dst_height_uv_loop
-
-extra_uv_cp_needed
-    ands            r10, r5, #0x3f                  ;check to see if extra copy is needed
-    sub             r11, r5, r10
-    ldr             r2, [sp]        ;srcptr1
-    ldr             r3, [sp, #4]        ;dstptr1
-    bne             extra_cp_src_to_dst_uv_width
-end_of_cp_src_to_dst_uv
-
-    subs            r1, r1, #1
-
-    addne               sp, sp, #8
-
-    ldrne               r2, [sp]        ;srcptr1
-    ldrne               r3, [sp, #4]        ;dstptr1
-
-    bne             cp_uv_loop
-
-    add             sp, sp, #8
-
-    vpop            {d8 - d15}
-    pop             {r4 - r11, pc}
-
-;=============================
-extra_cp_src_to_dst_width
-    add             r2, r2, r11
-    add             r3, r3, r11
-    add             r0, r8, r6
-    add             r11, r9, r7
-
-    mov             lr, r4, lsr #1
-extra_cp_src_to_dst_height_loop
-    mov             r8, r2
-    mov             r9, r3
-    add             r0, r8, r6
-    add             r11, r9, r7
-
-    mov             r12, r10
-
-extra_cp_src_to_dst_width_loop
-    vld1.8          {q0}, [r8]!
-    vld1.8          {q1}, [r0]!
-
-    subs            r12, r12, #16
-
-    vst1.8          {q0}, [r9]!
-    vst1.8          {q1}, [r11]!
-    bne             extra_cp_src_to_dst_width_loop
-
-    subs            lr, lr, #1
-
-    add             r2, r2, r6, lsl #1
-    add             r3, r3, r7, lsl #1
-
-    bne             extra_cp_src_to_dst_height_loop
-
-    b               end_of_cp_src_to_dst
-
-;=================================
-extra_cp_src_to_dst_uv_width
-    add             r2, r2, r11
-    add             r3, r3, r11
-    add             r0, r8, r6
-    add             r11, r9, r7
-
-    mov             lr, r4, lsr #1
-extra_cp_src_to_dst_height_uv_loop
-    mov             r8, r2
-    mov             r9, r3
-    add             r0, r8, r6
-    add             r11, r9, r7
-
-    mov             r12, r10
-
-extra_cp_src_to_dst_width_uv_loop
-    vld1.8          {d0}, [r8]!
-    vld1.8          {d1}, [r0]!
-
-    subs            r12, r12, #8
-
-    vst1.8          {d0}, [r9]!
-    vst1.8          {d1}, [r11]!
-    bne             extra_cp_src_to_dst_width_uv_loop
-
-    subs            lr, lr, #1
-
-    add             r2, r2, r6, lsl #1
-    add             r3, r3, r7, lsl #1
-
-    bne             extra_cp_src_to_dst_height_uv_loop
-
-    b               end_of_cp_src_to_dst_uv
-
-    ENDP
-    END

diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
deleted file mode 100644
index d3306b6..0000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
+++ /dev/null

@@ -1,259 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_yv12_copy_src_frame_func_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_scale_asm_offsets.asm
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: This function is used to copy source data in src_buffer[i] at beginning
-;of the encoding. The buffer has a width and height of cpi->oxcf.Width and
-;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4).
-
-;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
-;                                       YV12_BUFFER_CONFIG *dst_ybc);
-
-|vp8_yv12_copy_src_frame_func_neon| PROC
-    push            {r4 - r11, lr}
-    vpush           {d8 - d15}
-
-    ;Copy Y plane
-    ldr             r4, [r0, #yv12_buffer_config_y_height]
-    ldr             r5, [r0, #yv12_buffer_config_y_width]
-    ldr             r6, [r0, #yv12_buffer_config_y_stride]
-    ldr             r7, [r1, #yv12_buffer_config_y_stride]
-    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
-    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
-
-    add             r10, r2, r6             ;second row src
-    add             r11, r3, r7             ;second row dst
-    mov             r6, r6, lsl #1
-    mov             r7, r7, lsl #1
-    sub             r6, r6, r5              ;adjust stride
-    sub             r7, r7, r5
-
-    ; copy two rows at one time
-    mov             lr, r4, lsr #1
-
-cp_src_to_dst_height_loop
-    mov             r12, r5
-
-cp_width_128_loop
-    vld1.8          {q0, q1}, [r2]!
-    vld1.8          {q4, q5}, [r10]!
-    vld1.8          {q2, q3}, [r2]!
-    vld1.8          {q6, q7}, [r10]!
-    vld1.8          {q8, q9}, [r2]!
-    vld1.8          {q12, q13}, [r10]!
-    vld1.8          {q10, q11}, [r2]!
-    vld1.8          {q14, q15}, [r10]!
-    sub             r12, r12, #128
-    cmp             r12, #128
-    vst1.8          {q0, q1}, [r3]!
-    vst1.8          {q4, q5}, [r11]!
-    vst1.8          {q2, q3}, [r3]!
-    vst1.8          {q6, q7}, [r11]!
-    vst1.8          {q8, q9}, [r3]!
-    vst1.8          {q12, q13}, [r11]!
-    vst1.8          {q10, q11}, [r3]!
-    vst1.8          {q14, q15}, [r11]!
-    bhs             cp_width_128_loop
-
-    cmp             r12, #0
-    beq             cp_width_done
-
-cp_width_8_loop
-    vld1.8          {d0}, [r2]!
-    vld1.8          {d1}, [r10]!
-    sub             r12, r12, #8
-    cmp             r12, #8
-    vst1.8          {d0}, [r3]!
-    vst1.8          {d1}, [r11]!
-    bhs             cp_width_8_loop
-
-    cmp             r12, #0
-    beq             cp_width_done
-
-cp_width_1_loop
-    ldrb            r8, [r2], #1
-    subs            r12, r12, #1
-    strb            r8, [r3], #1
-    ldrb            r8, [r10], #1
-    strb            r8, [r11], #1
-    bne             cp_width_1_loop
-
-cp_width_done
-    subs            lr, lr, #1
-    add             r2, r2, r6
-    add             r3, r3, r7
-    add             r10, r10, r6
-    add             r11, r11, r7
-    bne             cp_src_to_dst_height_loop
-
-;copy last line for Y if y_height is odd
-    tst             r4, #1
-    beq             cp_width_done_1
-    mov             r12, r5
-
-cp_width_128_loop_1
-    vld1.8          {q0, q1}, [r2]!
-    vld1.8          {q2, q3}, [r2]!
-    vld1.8          {q8, q9}, [r2]!
-    vld1.8          {q10, q11}, [r2]!
-    sub             r12, r12, #128
-    cmp             r12, #128
-    vst1.8          {q0, q1}, [r3]!
-    vst1.8          {q2, q3}, [r3]!
-    vst1.8          {q8, q9}, [r3]!
-    vst1.8          {q10, q11}, [r3]!
-    bhs             cp_width_128_loop_1
-
-    cmp             r12, #0
-    beq             cp_width_done_1
-
-cp_width_8_loop_1
-    vld1.8          {d0}, [r2]!
-    sub             r12, r12, #8
-    cmp             r12, #8
-    vst1.8          {d0}, [r3]!
-    bhs             cp_width_8_loop_1
-
-    cmp             r12, #0
-    beq             cp_width_done_1
-
-cp_width_1_loop_1
-    ldrb            r8, [r2], #1
-    subs            r12, r12, #1
-    strb            r8, [r3], #1
-    bne             cp_width_1_loop_1
-cp_width_done_1
-
-;Copy U & V planes
-    ldr             r4, [r0, #yv12_buffer_config_uv_height]
-    ldr             r5, [r0, #yv12_buffer_config_uv_width]
-    ldr             r6, [r0, #yv12_buffer_config_uv_stride]
-    ldr             r7, [r1, #yv12_buffer_config_uv_stride]
-    ldr             r2, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
-    ldr             r3, [r1, #yv12_buffer_config_u_buffer]       ;dstptr1
-
-    add             r10, r2, r6             ;second row src
-    add             r11, r3, r7             ;second row dst
-    mov             r6, r6, lsl #1
-    mov             r7, r7, lsl #1
-    sub             r6, r6, r5              ;adjust stride
-    sub             r7, r7, r5
-
-    mov             r9, #2
-
-cp_uv_loop
-    ;copy two rows at one time
-    mov             lr, r4, lsr #1
-
-cp_src_to_dst_height_uv_loop
-    mov             r12, r5
-
-cp_width_uv_64_loop
-    vld1.8          {q0, q1}, [r2]!
-    vld1.8          {q4, q5}, [r10]!
-    vld1.8          {q2, q3}, [r2]!
-    vld1.8          {q6, q7}, [r10]!
-    sub             r12, r12, #64
-    cmp             r12, #64
-    vst1.8          {q0, q1}, [r3]!
-    vst1.8          {q4, q5}, [r11]!
-    vst1.8          {q2, q3}, [r3]!
-    vst1.8          {q6, q7}, [r11]!
-    bhs             cp_width_uv_64_loop
-
-    cmp             r12, #0
-    beq             cp_width_uv_done
-
-cp_width_uv_8_loop
-    vld1.8          {d0}, [r2]!
-    vld1.8          {d1}, [r10]!
-    sub             r12, r12, #8
-    cmp             r12, #8
-    vst1.8          {d0}, [r3]!
-    vst1.8          {d1}, [r11]!
-    bhs             cp_width_uv_8_loop
-
-    cmp             r12, #0
-    beq             cp_width_uv_done
-
-cp_width_uv_1_loop
-    ldrb            r8, [r2], #1
-    subs            r12, r12, #1
-    strb            r8, [r3], #1
-    ldrb            r8, [r10], #1
-    strb            r8, [r11], #1
-    bne             cp_width_uv_1_loop
-
-cp_width_uv_done
-    subs            lr, lr, #1
-    add             r2, r2, r6
-    add             r3, r3, r7
-    add             r10, r10, r6
-    add             r11, r11, r7
-    bne             cp_src_to_dst_height_uv_loop
-
-;copy last line for U & V if uv_height is odd
-    tst             r4, #1
-    beq             cp_width_uv_done_1
-    mov             r12, r5
-
-cp_width_uv_64_loop_1
-    vld1.8          {q0, q1}, [r2]!
-    vld1.8          {q2, q3}, [r2]!
-    sub             r12, r12, #64
-    cmp             r12, #64
-    vst1.8          {q0, q1}, [r3]!
-    vst1.8          {q2, q3}, [r3]!
-    bhs             cp_width_uv_64_loop_1
-
-    cmp             r12, #0
-    beq             cp_width_uv_done_1
-
-cp_width_uv_8_loop_1
-    vld1.8          {d0}, [r2]!
-    sub             r12, r12, #8
-    cmp             r12, #8
-    vst1.8          {d0}, [r3]!
-    bhs             cp_width_uv_8_loop_1
-
-    cmp             r12, #0
-    beq             cp_width_uv_done_1
-
-cp_width_uv_1_loop_1
-    ldrb            r8, [r2], #1
-    subs            r12, r12, #1
-    strb            r8, [r3], #1
-    bne             cp_width_uv_1_loop_1
-cp_width_uv_done_1
-
-    subs            r9, r9, #1
-    ldrne           r2, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
-    ldrne           r3, [r1, #yv12_buffer_config_v_buffer]      ;dstptr1
-    ldrne           r10, [r0, #yv12_buffer_config_uv_stride]
-    ldrne           r11, [r1, #yv12_buffer_config_uv_stride]
-
-    addne           r10, r2, r10                ;second row src
-    addne           r11, r3, r11                ;second row dst
-
-    bne             cp_uv_loop
-
-    vpop            {d8 - d15}
-    pop             {r4 - r11, pc}
-
-    ENDP
-    END

diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c
deleted file mode 100644
index d408eb3..0000000
--- a/vpx_scale/arm/neon/yv12extend_arm.c
+++ /dev/null

@@ -1,21 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_scale_rtcd.h"
-
-extern void vp8_yv12_copy_frame_func_neon(
-    const struct yv12_buffer_config *src_ybc,
-    struct yv12_buffer_config *dst_ybc);
-
-void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc,
-                              struct yv12_buffer_config *dst_ybc) {
-  vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
-  vp8_yv12_extend_frame_borders_c(dst_ybc);
-}

diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index 1fa41af..0a1594b 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk

@@ -9,11 +9,6 @@
 SCALE_SRCS-yes += vpx_scale_rtcd.c
 SCALE_SRCS-yes += vpx_scale_rtcd.pl
 
-#neon
-SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/yv12extend_arm.c
-
 #mips(dspr2)
 SCALE_SRCS-$(HAVE_DSPR2)  += mips/dspr2/yv12extend_dspr2.c
 

diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 5a7f973..d4a2b81 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl

@@ -19,8 +19,6 @@
 add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
 
 add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_frame neon_asm/;
-$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon;
 
 add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";