Merge "Adds config opt for highbitdepth + misc. vpx"
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 783c9d3..7c050fa 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -31,6 +31,14 @@
#include "./tools_common.h"
#include "./video_writer.h"
+// The input video frame is downsampled several times to generate a
+// multi-level hierarchical structure. kNumEncoders is defined as the number
+// of encoding levels required. For example, if the size of input video is
+// 1280x720, kNumEncoders is 3, and down-sampling factor is 2, the encoder
+// outputs 3 bitstreams with resolution of 1280x720(level 0),
+// 640x360(level 1), and 320x180(level 2) respectively.
+#define kNumEncoders 3
+
static const char *exec_name;
void usage_exit() {
@@ -41,14 +49,6 @@
}
int main(int argc, char *argv[]) {
- // The input video frame is downsampled several times to generate a
- // multi-level hierarchical structure. kNumEncoders is defined as the number
- // of encoding levels required. For example, if the size of input video is
- // 1280x720, kNumEncoders is 3, and down-sampling factor is 2, the encoder
- // outputs 3 bitstreams with resolution of 1280x720(level 0),
- // 640x360(level 1), and 320x180(level 2) respectively.
- static const int kNumEncoders = 3;
-
int frame_cnt = 0;
FILE *infile = NULL;
VpxVideoWriter *writers[kNumEncoders];
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index f37ca63..5840c2b 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -25,22 +25,18 @@
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
-#if HAVE_NEON_ASM || HAVE_NEON
+#if HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
-#endif
-#if HAVE_NEON_ASM
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-#endif
-#if HAVE_NEON
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
@@ -150,9 +146,7 @@
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
-#endif
-#if HAVE_NEON_ASM
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
deleted file mode 100644
index c4f09c7..0000000
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,409 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
- EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
- EXPORT |vp8_loop_filter_vertical_edge_y_neon|
- EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_loop_filter_horizontal_edge_y_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r3, [sp, #68] ; load thresh
- add r12, r2, r1
- add r1, r1, r1
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vld1.u8 {q3}, [r2@128], r1 ; p3
- vld1.u8 {q4}, [r12@128], r1 ; p2
- vld1.u8 {q5}, [r2@128], r1 ; p1
- vld1.u8 {q6}, [r12@128], r1 ; p0
- vld1.u8 {q7}, [r2@128], r1 ; q0
- vld1.u8 {q8}, [r12@128], r1 ; q1
- vld1.u8 {q9}, [r2@128] ; q2
- vld1.u8 {q10}, [r12@128] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r12, r12, r1, lsl #1
-
- bl vp8_loop_filter_neon
-
- vst1.u8 {q5}, [r2@128], r1 ; store op1
- vst1.u8 {q6}, [r12@128], r1 ; store op0
- vst1.u8 {q7}, [r2@128], r1 ; store oq0
- vst1.u8 {q8}, [r12@128], r1 ; store oq1
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_loop_filter_horizontal_edge_uv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- ldr r12, [sp, #68] ; load thresh
- ldr r2, [sp, #72] ; load v ptr
- vdup.u8 q2, r12 ; duplicate thresh
-
- sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
- sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r3@64], r1 ; p3
- vld1.u8 {d7}, [r12@64], r1 ; p3
- vld1.u8 {d8}, [r3@64], r1 ; p2
- vld1.u8 {d9}, [r12@64], r1 ; p2
- vld1.u8 {d10}, [r3@64], r1 ; p1
- vld1.u8 {d11}, [r12@64], r1 ; p1
- vld1.u8 {d12}, [r3@64], r1 ; p0
- vld1.u8 {d13}, [r12@64], r1 ; p0
- vld1.u8 {d14}, [r3@64], r1 ; q0
- vld1.u8 {d15}, [r12@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r12@64], r1 ; q1
- vld1.u8 {d18}, [r3@64], r1 ; q2
- vld1.u8 {d19}, [r12@64], r1 ; q2
- vld1.u8 {d20}, [r3@64] ; q3
- vld1.u8 {d21}, [r12@64] ; q3
-
- bl vp8_loop_filter_neon
-
- sub r0, r0, r1, lsl #1
- sub r2, r2, r1, lsl #1
-
- vst1.u8 {d10}, [r0@64], r1 ; store u op1
- vst1.u8 {d11}, [r2@64], r1 ; store v op1
- vst1.u8 {d12}, [r0@64], r1 ; store u op0
- vst1.u8 {d13}, [r2@64], r1 ; store v op0
- vst1.u8 {d14}, [r0@64], r1 ; store u oq0
- vst1.u8 {d15}, [r2@64], r1 ; store v oq0
- vst1.u8 {d16}, [r0@64] ; store u oq1
- vst1.u8 {d17}, [r2@64] ; store v oq1
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
-
-; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-
-|vp8_loop_filter_vertical_edge_y_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, #4 ; src ptr down by 4 columns
- add r1, r1, r1
- ldr r3, [sp, #68] ; load thresh
- add r12, r2, r1, asr #1
-
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d10}, [r2], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d14}, [r2], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d18}, [r2], r1
- vld1.u8 {d20}, [r12], r1
-
- vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d11}, [r2], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d15}, [r2], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d19}, [r2]
- vld1.u8 {d21}, [r12]
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp8_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
-
- sub r0, r0, #2 ; dst ptr
-
- vswp d14, d12
- vswp d16, d15
-
- add r12, r0, r1, asr #1
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
-
-; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- sub r12, r0, #4 ; move u pointer down by 4 columns
- ldr r2, [sp, #72] ; load v ptr
- vdup.u8 q1, r3 ; duplicate limit
- sub r3, r2, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r12], r1 ;load u data
- vld1.u8 {d7}, [r3], r1 ;load v data
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d9}, [r3], r1
- vld1.u8 {d10}, [r12], r1
- vld1.u8 {d11}, [r3], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d13}, [r3], r1
- vld1.u8 {d14}, [r12], r1
- vld1.u8 {d15}, [r3], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r12], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r12]
- vld1.u8 {d21}, [r3]
-
- ldr r12, [sp, #68] ; load thresh
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r12 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp8_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
- vswp d14, d12
- vswp d16, d15
-
- sub r0, r0, #2
- sub r2, r2, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
-
-; void vp8_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0 flimit
-; q1 limit
-; q2 thresh
-; q3 p3
-; q4 p2
-; q5 p1
-; q6 p0
-; q7 q0
-; q8 q1
-; q9 q2
-; q10 q3
-|vp8_loop_filter_neon| PROC
-
- ; vp8_filter_mask
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
-
- vmax.u8 q11, q11, q12
- vmax.u8 q12, q13, q14
- vmax.u8 q3, q3, q4
- vmax.u8 q15, q11, q12
-
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- ; vp8_hevmask
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 q15, q15, q3
-
- vmov.u8 q10, #0x80 ; 0x80
-
- vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
-
- vcge.u8 q15, q1, q15
-
- ; vp8_filter() function
- ; convert to signed
- veor q7, q7, q10 ; qs0
- vshr.u8 q2, q2, #1 ; a = a / 2
- veor q6, q6, q10 ; ps0
-
- veor q5, q5, q10 ; ps1
- vqadd.u8 q9, q9, q2 ; a = b + a
-
- veor q8, q8, q10 ; qs1
-
- vmov.u8 q10, #3 ; #3
-
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
-
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
- vorr q14, q13, q14 ; vp8_hevmask
-
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q9 ; vp8_filter_mask
-
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vmov.u8 q9, #4 ; #4
-
- ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d2, q2
- vqmovn.s16 d3, q11
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
-
-
- vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
-
- ; outer tap adjustments: ++vp8_filter >> 1
- vrshr.s8 q1, q1, #1
- vbic q1, q1, q14 ; vp8_filter &= ~hev
- vmov.u8 q0, #0x80 ; 0x80
- vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
- vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-;-----------------
-
- END
diff --git a/vp8/common/arm/neon/loopfilter_neon.c b/vp8/common/arm/neon/loopfilter_neon.c
new file mode 100644
index 0000000..0bec7fb
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilter_neon.c
@@ -0,0 +1,549 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_neon(
+ uint8x16_t qblimit, // flimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q10 = vdupq_n_u8(3);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vmovl_u8(vget_low_u8(q10));
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ q0u8 = vdupq_n_u8(0x80);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ src -= (pitch * 5);
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ u -= (pitch * 5);
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+
+ v -= (pitch * 5);
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+ const uint8x8x4_t result) {
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+ vst4_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#else
+ /*
+ * uint8x8x4_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ 20 21 22 23 | 24 25 26 27
+ 30 31 32 33 | 34 35 36 37
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 04 05 24 25
+ 02 03 22 23 | 06 07 26 27
+ 10 11 30 31 | 14 15 34 35
+ 12 13 32 33 | 16 17 36 37
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 04 14 24 34
+ 01 11 21 31 | 05 15 25 35
+ 02 12 22 32 | 06 16 26 36
+ 03 13 23 33 | 07 17 27 37
+ */
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+ vreinterpret_u16_u8(result.val[2]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+ vreinterpret_u16_u8(result.val[3]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#endif
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s, *d;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s = src - 4;
+ d6 = vld1_u8(s);
+ s += pitch;
+ d8 = vld1_u8(s);
+ s += pitch;
+ d10 = vld1_u8(s);
+ s += pitch;
+ d12 = vld1_u8(s);
+ s += pitch;
+ d14 = vld1_u8(s);
+ s += pitch;
+ d16 = vld1_u8(s);
+ s += pitch;
+ d18 = vld1_u8(s);
+ s += pitch;
+ d20 = vld1_u8(s);
+ s += pitch;
+ d7 = vld1_u8(s);
+ s += pitch;
+ d9 = vld1_u8(s);
+ s += pitch;
+ d11 = vld1_u8(s);
+ s += pitch;
+ d13 = vld1_u8(s);
+ s += pitch;
+ d15 = vld1_u8(s);
+ s += pitch;
+ d17 = vld1_u8(s);
+ s += pitch;
+ d19 = vld1_u8(s);
+ s += pitch;
+ d21 = vld1_u8(s);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+
+ d = src - 2;
+ write_4x8(d, pitch, q4ResultL);
+ d += pitch * 8;
+ write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d20 = vld1_u8(us);
+
+ vs = v - 4;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ ud = u - 2;
+ write_4x8(ud, pitch, q4ResultL);
+
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+ vd = v - 2;
+ write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 902a83a..5e245ce 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -58,9 +58,8 @@
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
-$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
@@ -69,9 +68,8 @@
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
-$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 2da0d8c..0c98eb1 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -413,9 +413,11 @@
denoiser->nmse_source_diff = 0;
denoiser->nmse_source_diff_count = 0;
// TODO(marpan): Adjust thresholds, including effect on resolution.
- denoiser->threshold_aggressive_mode = 40;
+ denoiser->threshold_aggressive_mode = 35;
if (width * height > 640 * 480)
- denoiser->threshold_aggressive_mode = 180;
+ denoiser->threshold_aggressive_mode = 150;
+ if (width * height > 1280 * 720)
+ denoiser->threshold_aggressive_mode = 1400;
return 0;
}
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 8dd1881..43f8957 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -487,6 +487,7 @@
MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
int this_rd;
+ int denoise_aggressive = 0;
/* Exit early and don't compute the distortion if this macroblock
* is marked inactive. */
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -505,10 +506,17 @@
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ denoise_aggressive =
+ (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0;
+ }
+#endif
+
// Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
if (this_mode == ZEROMV &&
x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
- cpi->closest_reference_frame == LAST_FRAME)
+ (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME))
{
this_rd = ((int64_t)this_rd) * rd_adj / 100;
}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 6db031f..94c9506 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -157,7 +157,6 @@
# common (neon)
#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c
@@ -174,6 +173,7 @@
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 46f463a..aef20f2 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -112,9 +112,9 @@
int quant_fp;
// skip forward transform and quantization
- int skip_txfm[MAX_MB_PLANE];
+ int skip_txfm[MAX_MB_PLANE << 2];
- int64_t bsse[MAX_MB_PLANE];
+ int64_t bsse[MAX_MB_PLANE << 2];
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 0cbb244..fccdaf5 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -33,10 +33,10 @@
int is_coded;
int num_4x4_blk;
int skip;
- int skip_txfm[MAX_MB_PLANE];
// For current partition, only if all Y, U, and V transform blocks'
// coefficients are quantized to 0, skippable is set to 0.
int skippable;
+ int skip_txfm[MAX_MB_PLANE << 2];
int best_mode_index;
int hybrid_pred_diff;
int comp_pred_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 8a737e1..6678450 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -476,20 +476,24 @@
}
if (!x->skip_recode) {
- if (x->skip_txfm[plane] == 0) {
- // full forward transform and quantization
- if (x->quant_fp)
- vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
- else
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
- } else if (x->skip_txfm[plane] == 2) {
- // fast path forward transform and quantization
- vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ if (max_txsize_lookup[plane_bsize] == tx_size) {
+ if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
+ // full forward transform and quantization
+ if (x->quant_fp)
+ vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+ else
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+ return;
+ }
} else {
- // skip forward transform
- p->eobs[block] = 0;
- *a = *l = 0;
- return;
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
}
}
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 9ad6db0..9d42a12 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -216,7 +216,7 @@
// If auto_mv_step_size is enabled then keep track of the largest
// motion vector component used.
- if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) {
+ if (cpi->sf.mv.auto_mv_step_size) {
unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
}
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 4d97053..8a74aec 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1802,7 +1802,6 @@
// to recode.
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
save_coding_context(cpi);
- cpi->dummy_packing = 1;
if (!cpi->sf.use_nonrd_pick_mode)
vp9_pack_bitstream(cpi, dest, size);
@@ -2272,7 +2271,6 @@
loopfilter_frame(cpi, cm);
// build the bitstream
- cpi->dummy_packing = 0;
vp9_pack_bitstream(cpi, dest, size);
if (cm->seg.update_map)
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index b152f08..9730831 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -332,7 +332,7 @@
TWO_PASS twopass;
YV12_BUFFER_CONFIG alt_ref_buffer;
- YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+
#if CONFIG_INTERNAL_STATS
unsigned int mode_chosen_counts[MAX_MODES];
@@ -371,8 +371,6 @@
int droppable;
- int dummy_packing; /* flag to indicate if packing is dummy */
-
int initial_width;
int initial_height;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 7867dc2..c2c2d28 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -76,16 +76,6 @@
p->stats_in = position;
}
-static int lookup_next_frame_stats(const TWO_PASS *p,
- FIRSTPASS_STATS *next_frame) {
- if (p->stats_in >= p->stats_in_end)
- return EOF;
-
- *next_frame = *p->stats_in;
- return 1;
-}
-
-
// Read frame stats at an offset from the current position.
static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
@@ -1871,16 +1861,17 @@
input_stats(twopass, this_frame);
// Provided that we are not at the end of the file...
- if (cpi->oxcf.auto_key &&
- lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
double loop_decay_rate;
// Check for a scene cut.
- if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
break;
// How fast is the prediction quality decaying?
- loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(&cpi->common,
+ twopass->stats_in);
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f73006e..7aaa903 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -172,6 +172,7 @@
const int ref = xd->mi[0]->mbmi.ref_frame[0];
unsigned int sse;
unsigned int var = 0;
+ unsigned int sum_sse = 0;
const int shift = 8;
int rate;
int64_t dist;
@@ -190,31 +191,33 @@
int lw = b_width_log2_lookup[unit_size] + 2;
int lh = b_height_log2_lookup[unit_size] + 2;
- x->bsse[i] = 0;
+ sum_sse = 0;
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
+ int block_idx = (idy << 1) + idx;
- var += cpi->fn_ptr[unit_size].vf(src , p->src.stride,
- dst, pd->dst.stride, &sse);
+ var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+ dst, pd->dst.stride, &sse);
+ x->bsse[(i << 2) + block_idx] = sse;
+ sum_sse += sse;
- x->bsse[i] += sse;
+ if (!x->select_tx_size) {
+ if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift)
+ x->skip_txfm[(i << 2) + block_idx] = 1;
+ else if (var < p->quant_thred[1] >> shift)
+ x->skip_txfm[(i << 2) + block_idx] = 2;
+ else
+ x->skip_txfm[(i << 2) + block_idx] = 0;
+ }
+
if (i == 0)
x->pred_sse[ref] += sse;
}
}
- if (!x->select_tx_size) {
- if (x->bsse[i] < p->quant_thred[0] >> shift)
- x->skip_txfm[i] = 1;
- else if (var < p->quant_thred[1] >> shift)
- x->skip_txfm[i] = 2;
- else
- x->skip_txfm[i] = 0;
- }
-
// Fast approximate the modelling function.
if (cpi->oxcf.speed > 4) {
int64_t rate;
@@ -230,7 +233,7 @@
rate_sum += rate;
dist_sum += dist;
} else {
- vp9_model_rd_from_var_lapndz(x->bsse[i], 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
@@ -390,17 +393,17 @@
if (!is_inter_block(mbmi)) {
vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
dist_block(plane, block, tx_size, args);
- } else {
- if (x->skip_txfm[plane] == 0) {
+ } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+ if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
dist_block(plane, block, tx_size, args);
- } else if (x->skip_txfm[plane] == 2) {
+ } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
// compute DC coefficient
int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
- args->sse = x->bsse[plane] << 4;
+ args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
if (!x->plane[plane].eobs[block])
args->dist = args->sse - ((coeff[0] * coeff[0] -
@@ -408,9 +411,13 @@
} else {
// skip forward transform
x->plane[plane].eobs[block] = 0;
- args->sse = x->bsse[plane] << 4;
+ args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
}
+ } else {
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ dist_block(plane, block, tx_size, args);
}
rate_block(plane, block, plane_bsize, tx_size, args);
@@ -2166,8 +2173,8 @@
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
INTERP_FILTER best_filter = SWITCHABLE;
- int skip_txfm[MAX_MB_PLANE] = {0};
- int64_t bsse[MAX_MB_PLANE] = {0};
+ int skip_txfm[MAX_MB_PLANE << 2] = {0};
+ int64_t bsse[MAX_MB_PLANE << 2] = {0};
int bsl = mi_width_log2_lookup[bsize];
int pred_filter_search = cpi->sf.cb_pred_filter_search ?
@@ -2772,6 +2779,10 @@
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
+ if (cpi->sf.alt_ref_search_fp)
+ if (!cm->show_frame)
+ continue;
+
if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
best_mode_index >=0 &&
vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
@@ -2787,6 +2798,10 @@
}
if (ref_frame == INTRA_FRAME) {
+ if (cpi->sf.adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_intra_rd)
+ continue;
+
if (!(intra_y_mode_mask & (1 << this_mode)))
continue;
if (this_mode != DC_PRED) {
@@ -2959,6 +2974,8 @@
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
max_plane = 1;
+ } else {
+ best_intra_rd = x->pred_sse[ref_frame];
}
*returnrate = rate2;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 0bead48..36c02e0 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -144,6 +144,7 @@
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
}
sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
sf->cb_partition_search = !boosted;
sf->cb_pred_filter_search = 1;
sf->alt_ref_search_fp = 1;
@@ -386,6 +387,7 @@
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
sf->motion_field_mode_search = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index e31185e..33c441f 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -291,6 +291,9 @@
// was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
int adaptive_pred_interp_filter;
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 0d95ed1..cc901b5 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -191,6 +191,7 @@
}
static void temporal_filter_iterate_c(VP9_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
int frame_count,
int alt_ref_index,
int strength,
@@ -206,7 +207,7 @@
DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
- YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
@@ -250,7 +251,7 @@
const int thresh_low = 10000;
const int thresh_high = 20000;
- if (cpi->frames[frame] == NULL)
+ if (frames[frame] == NULL)
continue;
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
@@ -261,9 +262,9 @@
} else {
// Find best match in this frame by MC
int err = temporal_filter_find_matching_mb_c(cpi,
- cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->y_stride);
+ frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->y_stride);
// Assign higher weight to matching MB if it's error
// score is lower. If not applying MC default behavior
@@ -275,10 +276,10 @@
if (filter_weight != 0) {
// Construct the predictors
temporal_filter_predictors_mb_c(mbd,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->u_buffer + mb_uv_offset,
- cpi->frames[frame]->v_buffer + mb_uv_offset,
- cpi->frames[frame]->y_stride,
+ frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset,
+ frames[frame]->y_stride,
mb_uv_width, mb_uv_height,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
@@ -432,6 +433,7 @@
int frames_to_blur_backward;
int frames_to_blur_forward;
struct scale_factors sf;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
// Apply context specific adjustments to the arnr filter parameters.
adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
@@ -440,12 +442,11 @@
start_frame = distance + frames_to_blur_forward;
// Setup frame pointers, NULL indicates frame not included in filter.
- vp9_zero(cpi->frames);
for (frame = 0; frame < frames_to_blur; ++frame) {
const int which_buffer = start_frame - frame;
struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
which_buffer);
- cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+ frames[frames_to_blur - 1 - frame] = &buf->img;
}
// Setup scaling factors. Scaling on each of the arnr frames is not supported
@@ -460,8 +461,8 @@
get_frame_new_buffer(cm)->y_crop_height);
for (frame = 0; frame < frames_to_blur; ++frame) {
- if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width ||
- cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) {
+ if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+ cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -473,9 +474,8 @@
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
- cpi->frames[frame] =
- vp9_scale_if_required(cm, cpi->frames[frame],
- &cpi->svc.scaled_frames[frame_used]);
+ frames[frame] = vp9_scale_if_required(cm, frames[frame],
+ &cpi->svc.scaled_frames[frame_used]);
++frame_used;
}
}
@@ -486,6 +486,6 @@
cm->width, cm->height);
}
- temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
- strength, &sf);
+ temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+ frames_to_blur_backward, strength, &sf);
}
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 4830412..d74d96f 100644
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -209,193 +209,3 @@
UNSHADOW_ARGS
pop rbp
ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get8x8var_sse2) PRIVATE
-sym(vp9_get8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- movq xmm1, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rdi]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- psubsw xmm1, xmm2
- paddw xmm7, xmm1
-
- pmaddwd xmm1, xmm1
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movq xmm2, QWORD PTR[rsi + rax * 2]
- movq xmm3, QWORD PTR[rdi + rdx * 2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
-
- punpckhwd xmm7, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm6, xmm7
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddw xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddw xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movq rdx, xmm7
- movsx rcx, dx
-
- mov dword ptr [rax], ecx
- movd DWORD PTR [rdi], xmm1
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 88e3117..86aa3b1 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -51,9 +51,46 @@
return 0;
}
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; i += 2) {
+ const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+
+ return 0;
+}
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
@@ -110,8 +147,7 @@
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_get8x8var_sse2, 8);
+ vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 2230db3..f49eb58 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -803,6 +803,23 @@
return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
}
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
+ unsigned int lib_flags) {
+ vpx_codec_frame_flags_t flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY
+#if CONFIG_SPATIAL_SVC
+ || (is_spatial_svc(cpi) && cpi->svc.layer_context[0].is_key_frame)
+#endif
+ )
+ flags |= VPX_FRAME_IS_KEY;
+
+ if (cpi->droppable)
+ flags |= VPX_FRAME_IS_DROPPABLE;
+
+ return flags;
+}
+
static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
const vpx_image_t *img,
vpx_codec_pts_t pts,
@@ -933,18 +950,7 @@
pkt.data.frame.duration =
(unsigned long)ticks_to_timebase_units(timebase,
dst_end_time_stamp - dst_time_stamp);
- pkt.data.frame.flags = lib_flags << 16;
-
- if (lib_flags & FRAMEFLAGS_KEY
-#if CONFIG_SPATIAL_SVC
- || (is_spatial_svc(cpi) &&
- cpi->svc.layer_context[0].is_key_frame)
-#endif
- )
- pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
- if (cpi->droppable)
- pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
if (ctx->pending_cx_data) {
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;