Merge "configure.sh: avoid test -a/-o for compatibility"
diff --git a/libs.mk b/libs.mk
index 1e01639..25fbc2c 100644
--- a/libs.mk
+++ b/libs.mk
@@ -409,12 +409,16 @@
curl -L -o $@ $(call libvpx_test_data_url,$(@F))
testdata:: $(LIBVPX_TEST_DATA)
- $(qexec)if [ -x "$$(which sha1sum)" ]; then\
+ $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
+ [ -x "$$(which shasum)" ] && sha1sum=shasum;\
+ [ -x "$$(which sha1)" ] && sha1sum=sha1;\
+ if [ -n "$${sha1sum}" ]; then\
+ set -e;\
echo "Checking test data:";\
if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
for f in $(call enabled,LIBVPX_TEST_DATA); do\
grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
- (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\
+ (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
done; \
fi; \
else\
diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh
index 6cb7d0e..854b74f 100755
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh
@@ -44,8 +44,8 @@
[ -e "${output_file}" ] || return 1
- local md5_last_frame=$(tail -n1 "${output_file}")
- local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:])
+ local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')"
+ local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')"
[ "${actual_md5}" = "${expected_md5}" ] || return 1
}
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 98ac0e6..ee6289f 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -679,4 +679,5 @@
9a70e8b7d14fba9234d0e51dce876635413ce444 thaloundeskmtg_640_480_30.yuv
e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv
c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf
-c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf
+c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5
+717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m
diff --git a/test/test.mk b/test/test.mk
index 53d4057..057e1e8 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -99,8 +99,8 @@
LIBVPX_TEST_SRCS-yes += idct_test.cc
LIBVPX_TEST_SRCS-yes += intrapred_test.cc
-LIBVPX_TEST_SRCS-yes += scale_border_test.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc
endif # VP8
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 9dc7c6a..83b7435 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -756,6 +756,18 @@
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9VarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_neon)));
+
+const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
+ vp9_sub_pixel_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon)));
+#endif // HAVE_NEON
#endif // CONFIG_VP9_ENCODER
} // namespace vp9
diff --git a/test/scale_border_test.cc b/test/vpx_scale_test.cc
similarity index 66%
rename from test/scale_border_test.cc
rename to test/vpx_scale_test.cc
index cc9a69a..b3302d9 100644
--- a/test/scale_border_test.cc
+++ b/test/vpx_scale_test.cc
@@ -21,11 +21,12 @@
namespace {
typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
+typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf,
+ YV12_BUFFER_CONFIG *dst_ybf);
-class ExtendBorderTest
- : public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+class VpxScaleBase {
public:
- virtual ~ExtendBorderTest() {
+ virtual ~VpxScaleBase() {
libvpx_test::ClearSystemState();
}
@@ -35,7 +36,6 @@
vpx_memset(&img_, 0, sizeof(img_));
ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
VP8BORDERINPIXELS));
-
vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
img_.y_stride);
@@ -47,31 +47,25 @@
vpx_memset(&ref_img_, 0, sizeof(ref_img_));
ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
VP8BORDERINPIXELS));
-
vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
- FillPlane(ref_img_.y_buffer, ref_img_.y_crop_width, ref_img_.y_crop_height,
- ref_img_.y_stride);
- FillPlane(ref_img_.u_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_stride);
- FillPlane(ref_img_.v_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_stride);
+
+ vpx_memset(&cpy_img_, 0, sizeof(cpy_img_));
+ ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
+ VP8BORDERINPIXELS));
+ vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
+ ReferenceCopyFrame();
}
void DeallocImage() {
vp8_yv12_de_alloc_frame_buffer(&img_);
vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+ vp8_yv12_de_alloc_frame_buffer(&cpy_img_);
}
- private:
+ protected:
static const int kBufFiller = 123;
static const int kBufMax = kBufFiller - 1;
- virtual void SetUp() {
- extend_fn_ = GetParam();
- }
-
static void FillPlane(uint8_t *buf, int width, int height, int stride) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
@@ -80,24 +74,6 @@
}
}
- void ReferenceExtendBorder() {
- ExtendPlane(ref_img_.y_buffer,
- ref_img_.y_crop_width, ref_img_.y_crop_height,
- ref_img_.y_width, ref_img_.y_height,
- ref_img_.y_stride,
- ref_img_.border);
- ExtendPlane(ref_img_.u_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_width, ref_img_.uv_height,
- ref_img_.uv_stride,
- ref_img_.border / 2);
- ExtendPlane(ref_img_.v_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_width, ref_img_.uv_height,
- ref_img_.uv_stride,
- ref_img_.border / 2);
- }
-
static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
int width, int height, int stride, int padding) {
// Copy the outermost visible pixel to a distance of at least 'padding.'
@@ -136,17 +112,75 @@
}
}
+ void ReferenceExtendBorder() {
+ ExtendPlane(ref_img_.y_buffer,
+ ref_img_.y_crop_width, ref_img_.y_crop_height,
+ ref_img_.y_width, ref_img_.y_height,
+ ref_img_.y_stride,
+ ref_img_.border);
+ ExtendPlane(ref_img_.u_buffer,
+ ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+ ref_img_.uv_width, ref_img_.uv_height,
+ ref_img_.uv_stride,
+ ref_img_.border / 2);
+ ExtendPlane(ref_img_.v_buffer,
+ ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+ ref_img_.uv_width, ref_img_.uv_height,
+ ref_img_.uv_stride,
+ ref_img_.border / 2);
+ }
+
+ void ReferenceCopyFrame() {
+ // Copy img_ to ref_img_ and extend frame borders. This will be used for
+ // verifying extend_fn_ as well as copy_frame_fn_.
+ EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
+ for (int y = 0; y < img_.y_crop_height; ++y) {
+ for (int x = 0; x < img_.y_crop_width; ++x) {
+ ref_img_.y_buffer[x + y * ref_img_.y_stride] =
+ img_.y_buffer[x + y * img_.y_stride];
+ }
+ }
+
+ for (int y = 0; y < img_.uv_crop_height; ++y) {
+ for (int x = 0; x < img_.uv_crop_width; ++x) {
+ ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
+ img_.u_buffer[x + y * img_.uv_stride];
+ ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
+ img_.v_buffer[x + y * img_.uv_stride];
+ }
+ }
+
+ ReferenceExtendBorder();
+ }
+
+ void CompareImages(const YV12_BUFFER_CONFIG actual) {
+ EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
+ EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
+ ref_img_.frame_size));
+ }
+
+ YV12_BUFFER_CONFIG img_;
+ YV12_BUFFER_CONFIG ref_img_;
+ YV12_BUFFER_CONFIG cpy_img_;
+ int width_;
+ int height_;
+};
+
+class ExtendBorderTest
+ : public VpxScaleBase,
+ public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+ public:
+ virtual ~ExtendBorderTest() {}
+
+ protected:
+ virtual void SetUp() {
+ extend_fn_ = GetParam();
+ }
+
void ExtendBorder() {
ASM_REGISTER_STATE_CHECK(extend_fn_(&img_));
}
- void CompareImages() {
- EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
- EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, img_.buffer_alloc,
- ref_img_.frame_size));
- }
-
- protected:
void RunTest() {
#if ARCH_ARM
// Some arm devices OOM when trying to allocate the largest buffers.
@@ -160,17 +194,13 @@
ResetImage(kSizesToTest[w], kSizesToTest[h]);
ExtendBorder();
ReferenceExtendBorder();
- CompareImages();
+ CompareImages(img_);
DeallocImage();
}
}
}
- YV12_BUFFER_CONFIG img_;
- YV12_BUFFER_CONFIG ref_img_;
ExtendFrameBorderFunc extend_fn_;
- int width_;
- int height_;
};
TEST_P(ExtendBorderTest, ExtendBorder) {
@@ -179,4 +209,48 @@
INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest,
::testing::Values(vp8_yv12_extend_frame_borders_c));
+
+class CopyFrameTest
+ : public VpxScaleBase,
+ public ::testing::TestWithParam<CopyFrameFunc> {
+ public:
+ virtual ~CopyFrameTest() {}
+
+ protected:
+ virtual void SetUp() {
+ copy_frame_fn_ = GetParam();
+ }
+
+ void CopyFrame() {
+ ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_));
+ }
+ // Checks the copy against the reference for every width/height pair.
+ void RunTest() {
+#if ARCH_ARM
+ // Some ARM devices OOM on the largest buffers, so 16383 is left out there.
+ static const int kNumSizesToTest = 6;
+#else
+ static const int kNumSizesToTest = 7;
+#endif
+ static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+ for (int h = 0; h < kNumSizesToTest; ++h) {
+ for (int w = 0; w < kNumSizesToTest; ++w) {
+ ResetImage(kSizesToTest[w], kSizesToTest[h]);
+ ReferenceCopyFrame();
+ CopyFrame();  // img_ -> cpy_img_ through the function under test
+ CompareImages(cpy_img_);  // ref_img_ vs. cpy_img_
+ DeallocImage();
+ }
+ }
+ }
+
+ CopyFrameFunc copy_frame_fn_;
+};
+
+TEST_P(CopyFrameTest, CopyFrame) {
+ ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
+ ::testing::Values(vp8_yv12_copy_frame_c));
} // namespace
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index e81c05e..298f50f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -616,7 +616,7 @@
#if CONFIG_TEMPORAL_DENOISING
if (cpi->denoiser.aggressive_mode != 0 &&
- Q < cpi->denoiser.denoise_pars.qp_thresh) {
+ Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
// Under aggressive denoising mode, use segmentation to turn off loop
// filter below some qp thresh. The loop filter is turned off for all
// blocks that have been encoded as ZEROMV LAST x frames in a row,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d587749..3b1ca16 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -420,7 +420,7 @@
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -435,7 +435,7 @@
specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
+specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -483,7 +483,7 @@
specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -715,7 +715,7 @@
specialize qw/vp9_subtract_block/, "$sse2_x86inc";
add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
new file mode 100644
index 0000000..2d5ec79
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)zbin_oq_value;
+ (void)scan;
+
+ if (!skip_block) {
+ // Quantization pass, eight coefficients per iteration: saturating add of
+ // the rounding term, multiply by quant keeping bits 16..31, then re-sign.
+
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // Lane 0 of the first vector takes the [0] (DC) entries instead.
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // NOTE(review): presumably count is a multiple of 8 -- confirm.
+ for (i = 0; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
+ const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+ vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+ vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+ vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan =
+ vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ // Lanes that quantized to zero contribute 0; others contribute iscan + 1.
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ v_round = vmovq_n_s16(round_ptr[1]);  // drop the DC overrides
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ {  // Reduce the per-lane maxima to a single eob value.
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax_76543210),
+ vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+ } else {  // skip_block: zero both outputs and report eob 0
+ vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
new file mode 100644
index 0000000..f687118
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+enum { kWidth16 = 16 };
+enum { kHeight16 = 16 };
+enum { kHeight16PlusOne = 17 };  // rows produced by the first filter pass
+enum { kPixelStepOne = 1 };  // second tap reads the next pixel in the row
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);  // pairwise add, widen to 32 bits
+ const int64x2_t b = vpaddlq_s32(a);  // pairwise add, widen to 64 bits
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);  // lane 0 carries the sum of all input lanes
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);  // pairwise add, widen to 64 bits
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);  // lane 0 carries the sum of all input lanes
+}
+
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);  // 16-bit lanes: 32 diffs each for 16x16
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+ // Accumulates sum(a - b) and sum((a - b)^2) over w x h, 8 pixels a step.
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);  // signed a - b
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
+ kHeight16, sse, sum);
+}
+
+unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;  // sum of (a - b) over the block; result is sse - sum^2 / 256
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);  // weight of src[x]
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[0]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);  // rounding shift
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[0], vcombine_u8(out_lo, out_hi));
+ // Next row: 16 output bytes are written whatever output_width is.
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16);
+ DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
+ // Pass 1 filters 17 rows along x; pass 2 filters those down to 16 along y.
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+ kHeight16PlusOne, kWidth16,
+ BILINEAR_FILTERS_2TAP(xoffset));
+ var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
+ kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 584bcb8..74eaae5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1836,7 +1836,7 @@
BLOCK_SIZE max_size = BLOCK_8X8;
int bsl = mi_width_log2(BLOCK_64X64);
const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm)) % 2;
+ get_chessboard_index(cm->current_video_frame)) & 0x1;
// Trap case where we do not have a prediction.
if (search_range_ctrl &&
(left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
@@ -1880,6 +1880,60 @@
*max_block_size = max_size;
}
+// TODO(jingning) refactor functions setting partition search range
+static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ MODE_INFO *mi;
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+ BLOCK_SIZE bs, min_size, max_size;
+ // Seed min with 64x64 and max with 4x4; the scans below pull them inward.
+ min_size = BLOCK_64X64;
+ max_size = BLOCK_4X4;
+ // Sizes recorded in cm->prev_mi_grid_visible for the area bsize covers.
+ if (prev_mi) {  // NOTE(review): address-of result; looks always true
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = prev_mi[idy * cm->mi_stride + idx];
+ bs = mi ? mi->mbmi.sb_type : bsize;  // missing entry counts as bsize
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+ }
+ // Column of entries immediately to the left (xd->mi offset -1 per row).
+ if (xd->left_available) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ mi = xd->mi[idy * cm->mi_stride - 1];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+ // Row of entries immediately above (xd->mi offset -mi_stride).
+ if (xd->up_available) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = xd->mi[idx - cm->mi_stride];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+ // All sampled sizes equal: map both bounds through the lookup tables.
+ if (min_size == max_size) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+
+ *min_bs = min_size;
+ *max_bs = max_size;
+}
+
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
@@ -1888,6 +1942,15 @@
vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
+#if CONFIG_FP_MB_STATS
+const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4};
+const int num_16x16_blocks_high_lookup[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4};
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] =
+ {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120};
+#endif
+
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
@@ -1917,6 +1980,9 @@
const int xss = x->e_mbd.plane[1].subsampling_x;
const int yss = x->e_mbd.plane[1].subsampling_y;
+ BLOCK_SIZE min_size = cpi->sf.min_partition_size;
+ BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+
int partition_none_allowed = !force_horz_split && !force_vert_split;
int partition_horz_allowed = !force_vert_split && yss <= xss &&
bsize >= BLOCK_8X8;
@@ -1931,18 +1997,24 @@
set_offsets(cpi, tile, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3)
+ + get_chessboard_index(cm->current_video_frame)) & 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
// Determine partition types in search according to the speed features.
// The threshold set here has to be of square block size.
if (cpi->sf.auto_min_max_partition_size) {
- partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
- bsize >= cpi->sf.min_partition_size);
- partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
- bsize > cpi->sf.min_partition_size) ||
+ partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+ partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) ||
force_horz_split);
- partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
- bsize > cpi->sf.min_partition_size) ||
+ partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) ||
force_vert_split);
- do_split &= bsize > cpi->sf.min_partition_size;
+ do_split &= bsize > min_size;
}
if (cpi->sf.use_square_partition_only) {
partition_horz_allowed &= force_horz_split;
@@ -1993,6 +2065,52 @@
do_split = 0;
do_rect = 0;
}
+
+#if CONFIG_FP_MB_STATS
+ // Check if every 16x16 first pass block statistics has zero
+ // motion and the corresponding first pass residue is small enough.
+ // If that is the case, check the difference variance between the
+ // current frame and the last frame. If the variance is small enough,
+ // stop further splitting in RD optimization
+ if (cpi->use_fp_mb_stats && do_split != 0 &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ VP9_COMMON *cm = &cpi->common;
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if ((cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_NONZERO_MOTION_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_LEVEL0_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ unsigned int var;
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ mi_row, mi_col, bsize);
+ if (var < 8) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ }
+ }
+#endif
}
}
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2030,6 +2148,7 @@
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
+ pc_tree->split[i]->index = i;
rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
subsize, &this_rate, &this_dist, i != 3,
best_rd - sum_rd, pc_tree->split[i]);
@@ -2225,6 +2344,7 @@
}
vp9_zero(cpi->mb.pred_mv);
+ cpi->pc_root->index = 0;
if ((sf->partition_search_type == SEARCH_PARTITION &&
sf->use_lastframe_partitioning) ||
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 4b3f2ad..2a6c4b3 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -551,8 +551,8 @@
: 0];
}
-static INLINE int get_chessboard_index(const VP9_COMMON *cm) {
- return cm->current_video_frame % 2;
+static INLINE int get_chessboard_index(const int frame_index) {
+ return frame_index & 0x1;  // 0 on even frame indices, 1 on odd ones
}
#ifdef __cplusplus
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 30a0e9d..7a16001 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -394,7 +394,8 @@
INTERP_FILTER filter_ref = cm->interp_filter;
int bsl = mi_width_log2(bsize);
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
- (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0;
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
int const_motion[MAX_REF_FRAMES] = { 0 };
int bh = num_4x4_blocks_high_lookup[bsize] << 2;
int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
@@ -409,6 +410,10 @@
PRED_BUFFER *this_mode_pred = NULL;
int i;
+ // CTX is used by the temporal denoiser which is currently being developed.
+ // TODO(jbb): when temporal denoiser is finished and in the default build
+ // remove the following line;
+ (void) ctx;
if (cpi->sf.reuse_inter_pred_sby) {
for (i = 0; i < 3; i++) {
tmp[i].data = &pred_buf[pixels_in_block * i];
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f65ac7b..c6580ee 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2025,7 +2025,8 @@
int bsl = mi_width_log2_lookup[bsize];
int pred_filter_search = cpi->sf.cb_pred_filter_search ?
- (((mi_row + mi_col) >> bsl)) & 0x01 : 0;
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
if (pred_filter_search) {
INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7315dd4..6ca1bc5 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -110,10 +110,12 @@
if (speed >= 3) {
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720)
+ if (MIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- else
+ sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
+ } else {
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ }
sf->adaptive_pred_interp_filter = 0;
sf->cb_pred_filter_search = 1;
@@ -334,6 +336,7 @@
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->cb_pred_filter_search = 0;
+ sf->cb_partition_search = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 929acaf..de731ce 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -286,6 +286,8 @@
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
+ int cb_partition_search;
+
// Fast quantization process path
int use_quant_fp;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b3dc0b1..197fdba 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -305,6 +305,16 @@
return VPX_CODEC_OK;
}
+static int get_image_bps(const vpx_image_t *img) {  // bits per pixel
+ switch (img->fmt) {
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_I420: return 12;
+ case VPX_IMG_FMT_I422: return 16;
+ case VPX_IMG_FMT_I444: return 24;
+ default: assert(0 && "Invalid image format");
+ }
+ return 0;  // unknown format with asserts compiled out
+}
static vpx_codec_err_t set_encoder_config(
VP9EncoderConfig *oxcf,
@@ -672,16 +682,6 @@
priv->extra_cfg = extracfg_map[i].cfg;
priv->extra_cfg.pkt_list = &priv->pkt_list.head;
- // Maximum buffer size approximated based on having multiple ARF.
- priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
-
- if (priv->cx_data_sz < 4096)
- priv->cx_data_sz = 4096;
-
- priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz);
- if (priv->cx_data == NULL)
- return VPX_CODEC_MEM_ERROR;
-
vp9_initialize_enc();
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -806,8 +806,24 @@
unsigned long deadline) {
vpx_codec_err_t res = VPX_CODEC_OK;
- if (img)
+ if (img != NULL) {
res = validate_img(ctx, img);
+ // TODO(jzern) the checks related to cpi's validity should be treated as a
+ // failure condition, encoder setup is done fully in init() currently.
+ if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+ // There's no codec control for multiple alt-refs so check the encoder
+ // instance for its status to determine the compressed data size.
+ ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
+ get_image_bps(img) / 8 *
+ (ctx->cpi->multi_arf_allowed ? 8 : 2);
+ if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
+
+ ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+ }
+ }
pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 0ea28d3..c8e7c5e 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -131,5 +131,7 @@
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
deleted file mode 100644
index 696f47a..0000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_yv12_copy_frame_func_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- INCLUDE vpx_scale_asm_offsets.asm
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
-; YV12_BUFFER_CONFIG *dst_ybc);
-
-|vp8_yv12_copy_frame_func_neon| PROC
- push {r4 - r11, lr}
- vpush {d8 - d15}
-
- sub sp, sp, #16
-
- ;Copy Y plane
- ldr r8, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
- ldr r9, [r1, #yv12_buffer_config_u_buffer] ;srcptr1
- ldr r10, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
- ldr r11, [r1, #yv12_buffer_config_v_buffer] ;srcptr1
-
- ldr r4, [r0, #yv12_buffer_config_y_height]
- ldr r5, [r0, #yv12_buffer_config_y_width]
- ldr r6, [r0, #yv12_buffer_config_y_stride]
- ldr r7, [r1, #yv12_buffer_config_y_stride]
- ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
-
- str r8, [sp]
- str r9, [sp, #4]
- str r10, [sp, #8]
- str r11, [sp, #12]
-
- ; copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_loop
- mov r8, r2
- mov r9, r3
- add r10, r2, r6
- add r11, r3, r7
- movs r12, r5, lsr #7
- ble extra_cp_needed ; y_width < 128
-
-cp_src_to_dst_width_loop
- vld1.8 {q0, q1}, [r8]!
- vld1.8 {q8, q9}, [r10]!
- vld1.8 {q2, q3}, [r8]!
- vld1.8 {q10, q11}, [r10]!
- vld1.8 {q4, q5}, [r8]!
- vld1.8 {q12, q13}, [r10]!
- vld1.8 {q6, q7}, [r8]!
- vld1.8 {q14, q15}, [r10]!
-
- subs r12, r12, #1
-
- vst1.8 {q0, q1}, [r9]!
- vst1.8 {q8, q9}, [r11]!
- vst1.8 {q2, q3}, [r9]!
- vst1.8 {q10, q11}, [r11]!
- vst1.8 {q4, q5}, [r9]!
- vst1.8 {q12, q13}, [r11]!
- vst1.8 {q6, q7}, [r9]!
- vst1.8 {q14, q15}, [r11]!
-
- bne cp_src_to_dst_width_loop
-
- subs lr, lr, #1
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne cp_src_to_dst_height_loop
-
-extra_cp_needed
- ands r10, r5, #0x7f ;check to see if extra copy is needed
- sub r11, r5, r10
- ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
- bne extra_cp_src_to_dst_width
-end_of_cp_src_to_dst
-
-;Copy U & V planes
- ldr r2, [sp] ;srcptr1
- ldr r3, [sp, #4] ;dstptr1
- mov r4, r4, lsr #1 ;src uv_height
- mov r5, r5, lsr #1 ;src uv_width
- mov r6, r6, lsr #1 ;src uv_stride
- mov r7, r7, lsr #1 ;dst uv_stride
-
- mov r1, #2
-
-cp_uv_loop
-
- ;copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_uv_loop
- mov r8, r2
- mov r9, r3
- add r10, r2, r6
- add r11, r3, r7
- movs r12, r5, lsr #6
- ble extra_uv_cp_needed
-
-cp_src_to_dst_width_uv_loop
- vld1.8 {q0, q1}, [r8]!
- vld1.8 {q8, q9}, [r10]!
- vld1.8 {q2, q3}, [r8]!
- vld1.8 {q10, q11}, [r10]!
-
- subs r12, r12, #1
-
- vst1.8 {q0, q1}, [r9]!
- vst1.8 {q8, q9}, [r11]!
- vst1.8 {q2, q3}, [r9]!
- vst1.8 {q10, q11}, [r11]!
-
- bne cp_src_to_dst_width_uv_loop
-
- subs lr, lr, #1
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne cp_src_to_dst_height_uv_loop
-
-extra_uv_cp_needed
- ands r10, r5, #0x3f ;check to see if extra copy is needed
- sub r11, r5, r10
- ldr r2, [sp] ;srcptr1
- ldr r3, [sp, #4] ;dstptr1
- bne extra_cp_src_to_dst_uv_width
-end_of_cp_src_to_dst_uv
-
- subs r1, r1, #1
-
- addne sp, sp, #8
-
- ldrne r2, [sp] ;srcptr1
- ldrne r3, [sp, #4] ;dstptr1
-
- bne cp_uv_loop
-
- add sp, sp, #8
-
- vpop {d8 - d15}
- pop {r4 - r11, pc}
-
-;=============================
-extra_cp_src_to_dst_width
- add r2, r2, r11
- add r3, r3, r11
- add r0, r8, r6
- add r11, r9, r7
-
- mov lr, r4, lsr #1
-extra_cp_src_to_dst_height_loop
- mov r8, r2
- mov r9, r3
- add r0, r8, r6
- add r11, r9, r7
-
- mov r12, r10
-
-extra_cp_src_to_dst_width_loop
- vld1.8 {q0}, [r8]!
- vld1.8 {q1}, [r0]!
-
- subs r12, r12, #16
-
- vst1.8 {q0}, [r9]!
- vst1.8 {q1}, [r11]!
- bne extra_cp_src_to_dst_width_loop
-
- subs lr, lr, #1
-
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne extra_cp_src_to_dst_height_loop
-
- b end_of_cp_src_to_dst
-
-;=================================
-extra_cp_src_to_dst_uv_width
- add r2, r2, r11
- add r3, r3, r11
- add r0, r8, r6
- add r11, r9, r7
-
- mov lr, r4, lsr #1
-extra_cp_src_to_dst_height_uv_loop
- mov r8, r2
- mov r9, r3
- add r0, r8, r6
- add r11, r9, r7
-
- mov r12, r10
-
-extra_cp_src_to_dst_width_uv_loop
- vld1.8 {d0}, [r8]!
- vld1.8 {d1}, [r0]!
-
- subs r12, r12, #8
-
- vst1.8 {d0}, [r9]!
- vst1.8 {d1}, [r11]!
- bne extra_cp_src_to_dst_width_uv_loop
-
- subs lr, lr, #1
-
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne extra_cp_src_to_dst_height_uv_loop
-
- b end_of_cp_src_to_dst_uv
-
- ENDP
- END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
deleted file mode 100644
index d3306b6..0000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
+++ /dev/null
@@ -1,259 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_yv12_copy_src_frame_func_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- INCLUDE vpx_scale_asm_offsets.asm
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: This function is used to copy source data in src_buffer[i] at beginning
-;of the encoding. The buffer has a width and height of cpi->oxcf.Width and
-;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4).
-
-;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
-; YV12_BUFFER_CONFIG *dst_ybc);
-
-|vp8_yv12_copy_src_frame_func_neon| PROC
- push {r4 - r11, lr}
- vpush {d8 - d15}
-
- ;Copy Y plane
- ldr r4, [r0, #yv12_buffer_config_y_height]
- ldr r5, [r0, #yv12_buffer_config_y_width]
- ldr r6, [r0, #yv12_buffer_config_y_stride]
- ldr r7, [r1, #yv12_buffer_config_y_stride]
- ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
-
- add r10, r2, r6 ;second row src
- add r11, r3, r7 ;second row dst
- mov r6, r6, lsl #1
- mov r7, r7, lsl #1
- sub r6, r6, r5 ;adjust stride
- sub r7, r7, r5
-
- ; copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_loop
- mov r12, r5
-
-cp_width_128_loop
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q4, q5}, [r10]!
- vld1.8 {q2, q3}, [r2]!
- vld1.8 {q6, q7}, [r10]!
- vld1.8 {q8, q9}, [r2]!
- vld1.8 {q12, q13}, [r10]!
- vld1.8 {q10, q11}, [r2]!
- vld1.8 {q14, q15}, [r10]!
- sub r12, r12, #128
- cmp r12, #128
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q4, q5}, [r11]!
- vst1.8 {q2, q3}, [r3]!
- vst1.8 {q6, q7}, [r11]!
- vst1.8 {q8, q9}, [r3]!
- vst1.8 {q12, q13}, [r11]!
- vst1.8 {q10, q11}, [r3]!
- vst1.8 {q14, q15}, [r11]!
- bhs cp_width_128_loop
-
- cmp r12, #0
- beq cp_width_done
-
-cp_width_8_loop
- vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r10]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- vst1.8 {d1}, [r11]!
- bhs cp_width_8_loop
-
- cmp r12, #0
- beq cp_width_done
-
-cp_width_1_loop
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- ldrb r8, [r10], #1
- strb r8, [r11], #1
- bne cp_width_1_loop
-
-cp_width_done
- subs lr, lr, #1
- add r2, r2, r6
- add r3, r3, r7
- add r10, r10, r6
- add r11, r11, r7
- bne cp_src_to_dst_height_loop
-
-;copy last line for Y if y_height is odd
- tst r4, #1
- beq cp_width_done_1
- mov r12, r5
-
-cp_width_128_loop_1
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q2, q3}, [r2]!
- vld1.8 {q8, q9}, [r2]!
- vld1.8 {q10, q11}, [r2]!
- sub r12, r12, #128
- cmp r12, #128
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q2, q3}, [r3]!
- vst1.8 {q8, q9}, [r3]!
- vst1.8 {q10, q11}, [r3]!
- bhs cp_width_128_loop_1
-
- cmp r12, #0
- beq cp_width_done_1
-
-cp_width_8_loop_1
- vld1.8 {d0}, [r2]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- bhs cp_width_8_loop_1
-
- cmp r12, #0
- beq cp_width_done_1
-
-cp_width_1_loop_1
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- bne cp_width_1_loop_1
-cp_width_done_1
-
-;Copy U & V planes
- ldr r4, [r0, #yv12_buffer_config_uv_height]
- ldr r5, [r0, #yv12_buffer_config_uv_width]
- ldr r6, [r0, #yv12_buffer_config_uv_stride]
- ldr r7, [r1, #yv12_buffer_config_uv_stride]
- ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1
-
- add r10, r2, r6 ;second row src
- add r11, r3, r7 ;second row dst
- mov r6, r6, lsl #1
- mov r7, r7, lsl #1
- sub r6, r6, r5 ;adjust stride
- sub r7, r7, r5
-
- mov r9, #2
-
-cp_uv_loop
- ;copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_uv_loop
- mov r12, r5
-
-cp_width_uv_64_loop
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q4, q5}, [r10]!
- vld1.8 {q2, q3}, [r2]!
- vld1.8 {q6, q7}, [r10]!
- sub r12, r12, #64
- cmp r12, #64
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q4, q5}, [r11]!
- vst1.8 {q2, q3}, [r3]!
- vst1.8 {q6, q7}, [r11]!
- bhs cp_width_uv_64_loop
-
- cmp r12, #0
- beq cp_width_uv_done
-
-cp_width_uv_8_loop
- vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r10]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- vst1.8 {d1}, [r11]!
- bhs cp_width_uv_8_loop
-
- cmp r12, #0
- beq cp_width_uv_done
-
-cp_width_uv_1_loop
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- ldrb r8, [r10], #1
- strb r8, [r11], #1
- bne cp_width_uv_1_loop
-
-cp_width_uv_done
- subs lr, lr, #1
- add r2, r2, r6
- add r3, r3, r7
- add r10, r10, r6
- add r11, r11, r7
- bne cp_src_to_dst_height_uv_loop
-
-;copy last line for U & V if uv_height is odd
- tst r4, #1
- beq cp_width_uv_done_1
- mov r12, r5
-
-cp_width_uv_64_loop_1
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q2, q3}, [r2]!
- sub r12, r12, #64
- cmp r12, #64
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q2, q3}, [r3]!
- bhs cp_width_uv_64_loop_1
-
- cmp r12, #0
- beq cp_width_uv_done_1
-
-cp_width_uv_8_loop_1
- vld1.8 {d0}, [r2]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- bhs cp_width_uv_8_loop_1
-
- cmp r12, #0
- beq cp_width_uv_done_1
-
-cp_width_uv_1_loop_1
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- bne cp_width_uv_1_loop_1
-cp_width_uv_done_1
-
- subs r9, r9, #1
- ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
- ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1
- ldrne r10, [r0, #yv12_buffer_config_uv_stride]
- ldrne r11, [r1, #yv12_buffer_config_uv_stride]
-
- addne r10, r2, r10 ;second row src
- addne r11, r3, r11 ;second row dst
-
- bne cp_uv_loop
-
- vpop {d8 - d15}
- pop {r4 - r11, pc}
-
- ENDP
- END
diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c
deleted file mode 100644
index d408eb3..0000000
--- a/vpx_scale/arm/neon/yv12extend_arm.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_scale_rtcd.h"
-
-extern void vp8_yv12_copy_frame_func_neon(
- const struct yv12_buffer_config *src_ybc,
- struct yv12_buffer_config *dst_ybc);
-
-void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc,
- struct yv12_buffer_config *dst_ybc) {
- vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
- vp8_yv12_extend_frame_borders_c(dst_ybc);
-}
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index 1fa41af..0a1594b 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -9,11 +9,6 @@
SCALE_SRCS-yes += vpx_scale_rtcd.c
SCALE_SRCS-yes += vpx_scale_rtcd.pl
-#neon
-SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/yv12extend_arm.c
-
#mips(dspr2)
SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 5a7f973..d4a2b81 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -19,8 +19,6 @@
add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_frame neon_asm/;
-$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon;
add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";