Merge "Issue 150: Fixing linker warning in extend.c."
diff --git a/build/make/Makefile b/build/make/Makefile
index 2da8b47..5011ce3 100755
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -39,13 +39,8 @@
@if [ -d "$(DIST_DIR)/src" ]; then \
mkdir -p "$(DIST_DIR)/build"; \
cd "$(DIST_DIR)/build"; \
- if [ "$(TGT_CC)" = "rvct" ] ; then \
- echo "../src/configure --target=$(TOOLCHAIN) --libc=$(ALT_LIBC)"; \
- ../src/configure --target=$(TOOLCHAIN) --libc=$(ALT_LIBC); \
- else \
- echo "../src/configure --target=$(TOOLCHAIN)"; \
- ../src/configure --target=$(TOOLCHAIN); \
- fi; \
+ echo "Rerunning configure $(CONFIGURE_ARGS)"; \
+ ../src/configure $(CONFIGURE_ARGS); \
$(if $(filter vs%,$(TGT_CC)),make NO_LAUNCH_DEVENV=1;) \
fi
@if [ -d "$(DIST_DIR)" ]; then \
@@ -334,6 +329,7 @@
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_def.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_proj.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh
+ DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/yasm.rules
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
#
# This isn't really ARCH_ARM dependent, it's dependant on whether we're
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 35131b0..3b6c919 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -378,7 +378,7 @@
fmt_deps = sed -e 's;^__image.axf;\$(dir \$@)\$(notdir \$<).o \$@;' #hide
EOF
else cat >> $1 << EOF
-fmt_deps = sed -e 's;^\(.*\)\.o;\$(dir \$@)\1\$(suffix \$<).o \$@;' #hide
+fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\$(dir \$@)\1\$(suffix \$<).o \$@;'
EOF
fi
diff --git a/configure b/configure
index 9be0624..5c908d4 100755
--- a/configure
+++ b/configure
@@ -385,6 +385,7 @@
VERSION_MINOR=${VERSION_MINOR}
VERSION_PATCH=${VERSION_PATCH}
+CONFIGURE_ARGS=${CONFIGURE_ARGS}
EOF
enabled child || echo "CONFIGURE_ARGS?=${CONFIGURE_ARGS}" >> config.mk
diff --git a/libs.mk b/libs.mk
index 4bc5dd7..32f8a34 100644
--- a/libs.mk
+++ b/libs.mk
@@ -11,6 +11,8 @@
ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+CODEC_SRCS-yes += libs.mk
+
include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
@@ -59,7 +61,6 @@
# This variable uses deferred expansion intentionally, since the results of
# $(wildcard) may change during the course of the Make.
VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
-CODEC_SRCS-yes += $(SRC_PATH_BARE)/libs.mk # to show up in the msvs workspace
endif
# The following pairs define a mapping of locations in the distribution
diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
new file mode 100644
index 0000000..1922728
--- /dev/null
+++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -0,0 +1,67 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dc_only_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
+; unsigned char *dst_ptr, int pitch, int stride)
+; r0 input_dc
+; r1 pred_ptr
+; r2 dest_ptr
+; r3 pitch
+; sp stride
+
+|vp8_dc_only_idct_add_v6| PROC
+ stmdb sp!, {r4 - r7, lr}
+
+ add r0, r0, #4 ; input_dc += 4
+ ldr r12, c0x0000FFFF
+ ldr r4, [r1], r3
+ ldr r6, [r1], r3
+ and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
+ ldr lr, [sp, #20]
+ orr r0, r0, r0, lsl #16 ; a1 | a1
+
+ uxtab16 r5, r0, r4 ; a1+2 | a1+0
+ uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ ldr r4, [r1], r3
+ ldr r6, [r1]
+ str r5, [r2], lr
+ str r7, [r2], lr
+
+ uxtab16 r5, r0, r4
+ uxtab16 r4, r0, r4, ror #8
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ str r5, [r2], lr
+ str r7, [r2]
+
+ ldmia sp!, {r4 - r7, pc}
+
+ ENDP ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+ END
diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm
index d9913c7..d96908c 100644
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ b/vp8/common/arm/armv6/idct_v6.asm
@@ -15,8 +15,6 @@
EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
- EXPORT |vp8_dc_only_idct_armv6|
-
AREA |.text|, CODE, READONLY
;********************************************************************************
@@ -344,34 +342,4 @@
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
-
-; sjl added 10/17/08
-;void dc_only_idct_armv6(short input_dc, short *output, int pitch)
-|vp8_dc_only_idct_armv6| PROC
- stmdb sp!, {r4 - r6, lr}
-
- add r0, r0, #0x4
- add r4, r1, r2 ; output + shortpitch
- mov r0, r0, ASR #0x3 ;aka a1
- add r5, r1, r2, LSL #1 ; output + shortpitch * 2
- pkhbt r0, r0, r0, lsl #16 ; a1 | a1
- add r6, r5, r2 ; output + shortpitch * 3
-
- str r0, [r1, #0]
- str r0, [r1, #4]
-
- str r0, [r4, #0]
- str r0, [r4, #4]
-
- str r0, [r5, #0]
- str r0, [r5, #4]
-
- str r0, [r6, #0]
- str r0, [r6, #4]
-
-
- ldmia sp!, {r4 - r6, pc}
-
- ENDP ; |vp8_dc_only_idct_armv6|
-
END
diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm
index f4002b2..cab6bc9 100644
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -8,8 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp8_short_inv_walsh4x4_armv6|
- EXPORT |vp8_short_inv_walsh4x4_1_armv6|
+ EXPORT |vp8_short_inv_walsh4x4_v6|
+ EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
@@ -17,8 +17,8 @@
AREA |.text|, CODE, READONLY ; name this block of code
-;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
-|vp8_short_inv_walsh4x4_armv6| PROC
+;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+|vp8_short_inv_walsh4x4_v6| PROC
stmdb sp!, {r4 - r11, lr}
@@ -123,11 +123,11 @@
str r5, [r1]
ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_short_inv_walsh4x4_armv6|
+ ENDP ; |vp8_short_inv_walsh4x4_v6|
-;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_armv6| PROC
+;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_v6| PROC
ldrsh r2, [r0] ; [0]
add r2, r2, #3 ; [0] + 3
@@ -145,7 +145,7 @@
str r2, [r1]
bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_armv6|
+ ENDP ; |vp8_short_inv_walsh4x4_1_v6|
; Constant Pool
c0x00030003 DCD 0x00030003
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index 87d888d..6d917c4 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -15,9 +15,9 @@
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
-extern prototype_idct_scalar(vp8_dc_only_idct_armv6);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
-extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
+extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
+extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
+extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
@@ -25,20 +25,20 @@
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
-#undef vp8_idct_idct1_scalar
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_armv6
+#undef vp8_idct_idct1_scalar_add
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
+#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
#undef vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
-extern prototype_idct_scalar(vp8_dc_only_idct_neon);
+extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
@@ -48,8 +48,8 @@
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
-#undef vp8_idct_idct1_scalar
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_neon
+#undef vp8_idct_idct1_scalar_add
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
new file mode 100644
index 0000000..e6f141f
--- /dev/null
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@@ -0,0 +1,49 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dc_only_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
+; unsigned char *dst_ptr, int pitch, int stride)
+; r0 input_dc
+; r1 pred_ptr
+; r2 dst_ptr
+; r3 pitch
+; sp stride
+|vp8_dc_only_idct_add_neon| PROC
+ add r0, r0, #4
+ asr r0, r0, #3
+ ldr r12, [sp]
+ vdup.16 q0, r0
+
+ vld1.32 {d2[0]}, [r1], r3
+ vld1.32 {d2[1]}, [r1], r3
+ vld1.32 {d4[0]}, [r1], r3
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q1, q0, d2
+ vaddw.u8 q2, q0, d4
+
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+
+ vst1.32 {d2[0]}, [r2], r12
+ vst1.32 {d2[1]}, [r2], r12
+ vst1.32 {d4[0]}, [r2], r12
+ vst1.32 {d4[1]}, [r2]
+
+ bx lr
+
+ ENDP
+ END
diff --git a/vp8/common/arm/systemdependent.c b/vp8/common/arm/systemdependent.c
index bd6d146..b58cd78 100644
--- a/vp8/common/arm/systemdependent.c
+++ b/vp8/common/arm/systemdependent.c
@@ -43,7 +43,6 @@
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
- rtcd->idct.idct1_scalar = vp8_dc_only_idct_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
@@ -75,7 +74,6 @@
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
- rtcd->idct.idct1_scalar = vp8_dc_only_idct_armv6;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_armv6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_armv6;
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 5f7f943..2fbeaee 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -32,7 +32,7 @@
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
- rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
+ rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c;
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index adf7771..7b04799 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -18,8 +18,10 @@
#define prototype_idct(sym) \
void sym(short *input, short *output, int pitch)
-#define prototype_idct_scalar(sym) \
- void sym(short input, short *output, int pitch)
+#define prototype_idct_scalar_add(sym) \
+ void sym(short input, \
+ unsigned char *pred, unsigned char *output, \
+ int pitch, int stride)
#if ARCH_X86 || ARCH_X86_64
#include "x86/idct_x86.h"
@@ -39,10 +41,10 @@
#endif
extern prototype_idct(vp8_idct_idct16);
-#ifndef vp8_idct_idct1_scalar
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_c
+#ifndef vp8_idct_idct1_scalar_add
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
#endif
-extern prototype_idct_scalar(vp8_idct_idct1_scalar);
+extern prototype_idct_scalar_add(vp8_idct_idct1_scalar_add);
#ifndef vp8_idct_iwalsh1
@@ -56,14 +58,14 @@
extern prototype_second_order(vp8_idct_iwalsh16);
typedef prototype_idct((*vp8_idct_fn_t));
-typedef prototype_idct_scalar((*vp8_idct_scalar_fn_t));
+typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t));
typedef prototype_second_order((*vp8_second_order_fn_t));
typedef struct
{
- vp8_idct_fn_t idct1;
- vp8_idct_fn_t idct16;
- vp8_idct_scalar_fn_t idct1_scalar;
+ vp8_idct_fn_t idct1;
+ vp8_idct_fn_t idct16;
+ vp8_idct_scalar_add_fn_t idct1_scalar_add;
vp8_second_order_fn_t iwalsh1;
vp8_second_order_fn_t iwalsh16;
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 2866fa4..c968662 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -104,23 +104,30 @@
}
}
-
-void vp8_dc_only_idct_c(short input_dc, short *output, int pitch)
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
{
- int i;
- int a1;
- short *op = output;
- int shortpitch = pitch >> 1;
- a1 = ((input_dc + 4) >> 3);
+ int a1 = ((input_dc + 4) >> 3);
+ int r, c;
- for (i = 0; i < 4; i++)
+ for (r = 0; r < 4; r++)
{
- op[0] = a1;
- op[1] = a1;
- op[2] = a1;
- op[3] = a1;
- op += shortpitch;
+ for (c = 0; c < 4; c++)
+ {
+ int a = a1 + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += stride;
+ pred_ptr += pitch;
}
+
}
void vp8_short_inv_walsh4x4_c(short *input, short *output)
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 1670a1a..6488921 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -330,13 +330,6 @@
}
-
-//Notes: It is better to change CHAR to unsigned or signed to
-//avoid error on ARM platform.
-char vp8_an[8][64][3072];
-int vp8_cd[8][64];
-
-
double vp8_gaussian(double sigma, double mu, double x)
{
return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h
index 0ad8f55..f1e433d 100644
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -22,7 +22,7 @@
#if HAVE_MMX
extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
extern prototype_idct(vp8_short_idct4x4llm_mmx);
-extern prototype_idct_scalar(vp8_dc_only_idct_mmx);
+extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
@@ -34,8 +34,8 @@
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
-#undef vp8_idct_idct1_scalar
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_mmx
+#undef vp8_idct_idct1_scalar_add
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_mmx
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
index 1b18bd3..b0d4a0c 100644
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -220,35 +220,61 @@
pop rbp
ret
-;void dc_only_idct_mmx(short input_dc, short *output, int pitch)
-global sym(vp8_dc_only_idct_mmx)
-sym(vp8_dc_only_idct_mmx):
+;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+global sym(vp8_dc_only_idct_add_mmx)
+sym(vp8_dc_only_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
+ push rsi
+ push rdi
; end prolog
- movd mm0, arg(0) ;input_dc
+ mov rsi, arg(1) ;s -- prediction
+ mov rdi, arg(2) ;d -- destination
+ movsxd rax, dword ptr arg(4) ;stride
+ movsxd rdx, dword ptr arg(3) ;pitch
+ pxor mm0, mm0
- paddw mm0, [fours GLOBAL]
- mov rdx, arg(1) ;output
+ movd mm5, arg(0) ;input_dc
- psraw mm0, 3
- movsxd rax, dword ptr arg(2) ;pitch
+ paddw mm5, [fours GLOBAL]
- punpcklwd mm0, mm0
- punpckldq mm0, mm0
+ psraw mm5, 3
- movq [rdx], mm0
- movq [rdx+rax], mm0
+ punpcklwd mm5, mm5
+ punpckldq mm5, mm5
- movq [rdx+rax*2], mm0
- add rdx, rax
+ movd mm1, [rsi]
+ punpcklbw mm1, mm0
+ paddsw mm1, mm5
+ packuswb mm1, mm0 ; pack and unpack to saturate
+ movd [rdi], mm1
- movq [rdx+rax*2], mm0
+ movd mm2, [rsi+rdx]
+ punpcklbw mm2, mm0
+ paddsw mm2, mm5
+ packuswb mm2, mm0 ; pack and unpack to saturate
+ movd [rdi+rax], mm2
+
+ movd mm3, [rsi+2*rdx]
+ punpcklbw mm3, mm0
+ paddsw mm3, mm5
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ movd [rdi+2*rax], mm3
+
+ add rdi, rax
+ add rsi, rdx
+ movd mm4, [rsi+2*rdx]
+ punpcklbw mm4, mm0
+ paddsw mm4, mm5
+ packuswb mm4, mm0 ; pack and unpack to saturate
+ movd [rdi+2*rax], mm4
; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index b0008fc..b8a7127 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -731,7 +731,7 @@
times 4 dw 0x40
align 16
-global sym(vp8_six_tap_mmx) HIDDEN_DATA
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
times 8 dw 0
times 8 dw 0
@@ -791,7 +791,7 @@
align 16
-global sym(vp8_bilinear_filters_mmx) HIDDEN_DATA
+global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
sym(vp8_bilinear_filters_mmx):
times 8 dw 128
times 8 dw 0
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index 677eafa..66738f8 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -42,7 +42,7 @@
{
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx;
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
- rtcd->idct.idct1_scalar = vp8_dc_only_idct_mmx;
+ rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
new file mode 100644
index 0000000..886873c
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -0,0 +1,218 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_dc_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+
+;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride, int Dc)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch ; +4 = 40
+; sp + 40 = stride ; +4 = 44
+; sp + 44 = Dc ; +4 = 48
+
+
+|vp8_dequant_dc_idct_add_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r6, [sp, #44]
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r3, [sp]
+
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ mov r12, #3
+
+vp8_dequant_dc_add_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4]
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne vp8_dequant_dc_add_loop
+
+ sub r0, r0, #32
+ mov r1, r0
+
+; short_idct4x4llm_v6_dual
+ ldr r3, cospi8sqrt2minus1
+ ldr r4, sinpi8sqrt2
+ ldr r6, [r0, #8]
+ mov r5, #2
+vp8_dequant_dc_idct_loop1_v6
+ ldr r12, [r0, #24]
+ ldr r14, [r0, #16]
+ smulwt r9, r3, r6
+ smulwb r7, r3, r6
+ smulwt r10, r4, r6
+ smulwb r8, r4, r6
+ pkhbt r7, r7, r9, lsl #16
+ smulwt r11, r3, r12
+ pkhbt r8, r8, r10, lsl #16
+ uadd16 r6, r6, r7
+ smulwt r7, r4, r12
+ smulwb r9, r3, r12
+ smulwb r10, r4, r12
+ subs r5, r5, #1
+ pkhbt r9, r9, r11, lsl #16
+ ldr r11, [r0], #4
+ pkhbt r10, r10, r7, lsl #16
+ uadd16 r7, r12, r9
+ usub16 r7, r8, r7
+ uadd16 r6, r6, r10
+ uadd16 r10, r11, r14
+ usub16 r8, r11, r14
+ uadd16 r9, r10, r6
+ usub16 r10, r10, r6
+ uadd16 r6, r8, r7
+ usub16 r7, r8, r7
+ str r6, [r1, #8]
+ ldrne r6, [r0, #8]
+ str r7, [r1, #16]
+ str r10, [r1, #24]
+ str r9, [r1], #4
+ bne vp8_dequant_dc_idct_loop1_v6
+
+ mov r5, #2
+ sub r0, r1, #8
+vp8_dequant_dc_idct_loop2_v6
+ ldr r6, [r0], #4
+ ldr r7, [r0], #4
+ ldr r8, [r0], #4
+ ldr r9, [r0], #4
+ smulwt r1, r3, r6
+ smulwt r12, r4, r6
+ smulwt lr, r3, r8
+ smulwt r10, r4, r8
+ pkhbt r11, r8, r6, lsl #16
+ pkhbt r1, lr, r1, lsl #16
+ pkhbt r12, r10, r12, lsl #16
+ pkhtb r6, r6, r8, asr #16
+ uadd16 r6, r1, r6
+ pkhbt lr, r9, r7, lsl #16
+ uadd16 r10, r11, lr
+ usub16 lr, r11, lr
+ pkhtb r8, r7, r9, asr #16
+ subs r5, r5, #1
+ smulwt r1, r3, r8
+ smulwb r7, r3, r8
+ smulwt r11, r4, r8
+ smulwb r9, r4, r8
+ pkhbt r1, r7, r1, lsl #16
+ uadd16 r8, r1, r8
+ pkhbt r11, r9, r11, lsl #16
+ usub16 r1, r12, r8
+ uadd16 r8, r11, r6
+ ldr r9, c0x00040004
+ ldr r12, [sp, #40]
+ uadd16 r6, r10, r8
+ usub16 r7, r10, r8
+ uadd16 r7, r7, r9
+ uadd16 r6, r6, r9
+ uadd16 r10, r14, r1
+ usub16 r1, r14, r1
+ uadd16 r10, r10, r9
+ uadd16 r1, r1, r9
+ ldr r11, [r2], r12
+ mov r8, r7, asr #3
+ pkhtb r9, r8, r10, asr #19
+ mov r8, r1, asr #3
+ pkhtb r8, r8, r6, asr #19
+ uxtb16 lr, r11, ror #8
+ qadd16 r9, r9, lr
+ uxtb16 lr, r11
+ qadd16 r8, r8, lr
+ usat16 r9, #8, r9
+ usat16 r8, #8, r8
+ orr r9, r8, r9, lsl #8
+ ldr r11, [r2], r12
+ ldr lr, [sp]
+ ldr r12, [sp, #44]
+ mov r7, r7, lsl #16
+ mov r1, r1, lsl #16
+ mov r10, r10, lsl #16
+ mov r6, r6, lsl #16
+ mov r7, r7, asr #3
+ pkhtb r7, r7, r10, asr #19
+ mov r1, r1, asr #3
+ pkhtb r1, r1, r6, asr #19
+ uxtb16 r8, r11, ror #8
+ qadd16 r7, r7, r8
+ uxtb16 r8, r11
+ qadd16 r1, r1, r8
+ usat16 r7, #8, r7
+ usat16 r1, #8, r1
+ orr r1, r1, r7, lsl #8
+ str r9, [lr], r12
+ str r1, [lr], r12
+ str lr, [sp]
+ bne vp8_dequant_dc_idct_loop2_v6
+
+; vpx_memset
+ sub r0, r0, #32
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0]
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_dequant_dc_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2 DCD 0x00008A8C
+c0x00040004 DCD 0x00040004
+
+ END
diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
new file mode 100644
index 0000000..c13b512
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
@@ -0,0 +1,196 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dequant_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch ; +4 = 40
+; sp + 40 = stride ; +4 = 44
+
+
+|vp8_dequant_idct_add_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r3, [sp]
+
+ mov r12, #4
+
+vp8_dequant_add_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4]
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne vp8_dequant_add_loop
+
+ sub r0, r0, #32
+ mov r1, r0
+
+; short_idct4x4llm_v6_dual
+ ldr r3, cospi8sqrt2minus1
+ ldr r4, sinpi8sqrt2
+ ldr r6, [r0, #8]
+ mov r5, #2
+vp8_dequant_idct_loop1_v6
+ ldr r12, [r0, #24]
+ ldr r14, [r0, #16]
+ smulwt r9, r3, r6
+ smulwb r7, r3, r6
+ smulwt r10, r4, r6
+ smulwb r8, r4, r6
+ pkhbt r7, r7, r9, lsl #16
+ smulwt r11, r3, r12
+ pkhbt r8, r8, r10, lsl #16
+ uadd16 r6, r6, r7
+ smulwt r7, r4, r12
+ smulwb r9, r3, r12
+ smulwb r10, r4, r12
+ subs r5, r5, #1
+ pkhbt r9, r9, r11, lsl #16
+ ldr r11, [r0], #4
+ pkhbt r10, r10, r7, lsl #16
+ uadd16 r7, r12, r9
+ usub16 r7, r8, r7
+ uadd16 r6, r6, r10
+ uadd16 r10, r11, r14
+ usub16 r8, r11, r14
+ uadd16 r9, r10, r6
+ usub16 r10, r10, r6
+ uadd16 r6, r8, r7
+ usub16 r7, r8, r7
+ str r6, [r1, #8]
+ ldrne r6, [r0, #8]
+ str r7, [r1, #16]
+ str r10, [r1, #24]
+ str r9, [r1], #4
+ bne vp8_dequant_idct_loop1_v6
+
+ mov r5, #2
+ sub r0, r1, #8
+vp8_dequant_idct_loop2_v6
+ ldr r6, [r0], #4
+ ldr r7, [r0], #4
+ ldr r8, [r0], #4
+ ldr r9, [r0], #4
+ smulwt r1, r3, r6
+ smulwt r12, r4, r6
+ smulwt lr, r3, r8
+ smulwt r10, r4, r8
+ pkhbt r11, r8, r6, lsl #16
+ pkhbt r1, lr, r1, lsl #16
+ pkhbt r12, r10, r12, lsl #16
+ pkhtb r6, r6, r8, asr #16
+ uadd16 r6, r1, r6
+ pkhbt lr, r9, r7, lsl #16
+ uadd16 r10, r11, lr
+ usub16 lr, r11, lr
+ pkhtb r8, r7, r9, asr #16
+ subs r5, r5, #1
+ smulwt r1, r3, r8
+ smulwb r7, r3, r8
+ smulwt r11, r4, r8
+ smulwb r9, r4, r8
+ pkhbt r1, r7, r1, lsl #16
+ uadd16 r8, r1, r8
+ pkhbt r11, r9, r11, lsl #16
+ usub16 r1, r12, r8
+ uadd16 r8, r11, r6
+ ldr r9, c0x00040004
+ ldr r12, [sp, #40]
+ uadd16 r6, r10, r8
+ usub16 r7, r10, r8
+ uadd16 r7, r7, r9
+ uadd16 r6, r6, r9
+ uadd16 r10, r14, r1
+ usub16 r1, r14, r1
+ uadd16 r10, r10, r9
+ uadd16 r1, r1, r9
+ ldr r11, [r2], r12
+ mov r8, r7, asr #3
+ pkhtb r9, r8, r10, asr #19
+ mov r8, r1, asr #3
+ pkhtb r8, r8, r6, asr #19
+ uxtb16 lr, r11, ror #8
+ qadd16 r9, r9, lr
+ uxtb16 lr, r11
+ qadd16 r8, r8, lr
+ usat16 r9, #8, r9
+ usat16 r8, #8, r8
+ orr r9, r8, r9, lsl #8
+ ldr r11, [r2], r12
+ ldr lr, [sp]
+ ldr r12, [sp, #44]
+ mov r7, r7, lsl #16
+ mov r1, r1, lsl #16
+ mov r10, r10, lsl #16
+ mov r6, r6, lsl #16
+ mov r7, r7, asr #3
+ pkhtb r7, r7, r10, asr #19
+ mov r1, r1, asr #3
+ pkhtb r1, r1, r6, asr #19
+ uxtb16 r8, r11, ror #8
+ qadd16 r7, r7, r8
+ uxtb16 r8, r11
+ qadd16 r1, r1, r8
+ usat16 r7, #8, r7
+ usat16 r1, #8, r1
+ orr r1, r1, r7, lsl #8
+ str r9, [lr], r12
+ str r1, [lr], r12
+ str lr, [sp]
+ bne vp8_dequant_idct_loop2_v6
+
+; vpx_memset
+ sub r0, r0, #32
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0]
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2 DCD 0x00008A8C
+c0x00040004 DCD 0x00040004
+
+ END
diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
deleted file mode 100644
index 0252872..0000000
--- a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
+++ /dev/null
@@ -1,203 +0,0 @@
-;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_dc_idct_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
-|vp8_dequant_dc_idct_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r6, [sp, #36] ;load Dc
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r0, [sp]
-
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- mov r12, #3
-
-dequant_dc_idct_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne dequant_dc_idct_loop
-
- sub r0, r0, #32
- mov r1, r2
- mov r2, r3
-
-; short_idct4x4llm_v6_dual
-
- mov r3, #0x00004E00 ; cos
- orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
- mov r5, #0x2 ; i=2 i
-loop1_dual_11
- ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
- ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
- ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
-
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
- pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
- smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
- pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
- uadd16 r6, r6, r7 ; 5c+5 | 4c+4
- smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
- smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
- smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
- subs r5, r5, #0x1 ; i-- --
- pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
- ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
- pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
- uadd16 r7, r12, r9 ; 13c+13 | 12c+12
- usub16 r7, r8, r7 ; c c
- uadd16 r6, r6, r10 ; d d
- uadd16 r10, r11, r14 ; a a
- usub16 r8, r11, r14 ; b b
- uadd16 r9, r10, r6 ; a+d a+d
- usub16 r10, r10, r6 ; a-d a-d
- uadd16 r6, r8, r7 ; b+c b+c
- usub16 r7, r8, r7 ; b-c b-c
- str r6, [r1, r2] ; o5 | o4
- add r6, r2, r2 ; pitch * 2 p2
- str r7, [r1, r6] ; o9 | o8
- add r6, r6, r2 ; pitch * 3 p3
- str r10, [r1, r6] ; o13 | o12
- str r9, [r1], #0x4 ; o1 | o0 ++
- bne loop1_dual_11 ;
- mov r5, #0x2 ; i=2 i
- sub r0, r1, #8 ; reset input/output i/o
-loop2_dual_22
- ldr r6, [r0, r2] ; i5 | i4 5|4
- ldr r1, [r0] ; i1 | i0 1|0
- ldr r12, [r0, #0x4] ; i3 | i2 3|2
- add r14, r2, #0x4 ; pitch + 2 p+2
- ldr r14, [r0, r14] ; i7 | i6 7|6
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
- pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
- pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
- pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1
- pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
- uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
- pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
- uadd16 r10, r11, r9 ; a a
- usub16 r9, r11, r9 ; b b
- pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
- subs r5, r5, #0x1 ; i-- --
- smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
- smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
- smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
- smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
-
- pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
- pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
- uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
- usub16 r12, r8, r6 ; c (o1 | o5) c
- uadd16 r6, r11, r1 ; d (o3 | o7) d
- uadd16 r7, r10, r6 ; a+d a+d
- mov r8, #0x4 ; set up 4's 4
- orr r8, r8, #0x40000 ; 4|4
- usub16 r6, r10, r6 ; a-d a-d
- uadd16 r6, r6, r8 ; a-d+4 3|7
- uadd16 r7, r7, r8 ; a+d+4 0|4
- uadd16 r10, r9, r12 ; b+c b+c
- usub16 r1, r9, r12 ; b-c b-c
- uadd16 r10, r10, r8 ; b+c+4 1|5
- uadd16 r1, r1, r8 ; b-c+4 2|6
- mov r8, r10, asr #19 ; o1 >> 3
- strh r8, [r0, #2] ; o1
- mov r8, r1, asr #19 ; o2 >> 3
- strh r8, [r0, #4] ; o2
- mov r8, r6, asr #19 ; o3 >> 3
- strh r8, [r0, #6] ; o3
- mov r8, r7, asr #19 ; o0 >> 3
- strh r8, [r0], r2 ; o0 +p
- sxth r10, r10 ;
- mov r8, r10, asr #3 ; o5 >> 3
- strh r8, [r0, #2] ; o5
- sxth r1, r1 ;
- mov r8, r1, asr #3 ; o6 >> 3
- strh r8, [r0, #4] ; o6
- sxth r6, r6 ;
- mov r8, r6, asr #3 ; o7 >> 3
- strh r8, [r0, #6] ; o7
- sxth r7, r7 ;
- mov r8, r7, asr #3 ; o4 >> 3
- strh r8, [r0], r2 ; o4 +p
-;;;;; subs r5, r5, #0x1 ; i-- --
- bne loop2_dual_22 ;
-
-
-;vpx_memset
- ldr r0, [sp]
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
-
- ENDP ;|vp8_dequant_dc_idct_v68|
-
- END
diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm
deleted file mode 100644
index 15e4c68..0000000
--- a/vp8/decoder/arm/armv6/dequantidct_v6.asm
+++ /dev/null
@@ -1,184 +0,0 @@
-;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_idct_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
-|vp8_dequant_idct_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r0, [sp]
-
- mov r12, #4
-
-dequant_idct_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne dequant_idct_loop
-
- sub r0, r0, #32
- mov r1, r2
- mov r2, r3
-
-; short_idct4x4llm_v6_dual
-
- mov r3, #0x00004E00 ; cos
- orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
- mov r5, #0x2 ; i=2 i
-loop1_dual_1
- ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
- ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
- ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
-
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
- pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
- smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
- pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
- uadd16 r6, r6, r7 ; 5c+5 | 4c+4
- smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
- smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
- smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
- subs r5, r5, #0x1 ; i-- --
- pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
- ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
- pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
- uadd16 r7, r12, r9 ; 13c+13 | 12c+12
- usub16 r7, r8, r7 ; c c
- uadd16 r6, r6, r10 ; d d
- uadd16 r10, r11, r14 ; a a
- usub16 r8, r11, r14 ; b b
- uadd16 r9, r10, r6 ; a+d a+d
- usub16 r10, r10, r6 ; a-d a-d
- uadd16 r6, r8, r7 ; b+c b+c
- usub16 r7, r8, r7 ; b-c b-c
- str r6, [r1, r2] ; o5 | o4
- add r6, r2, r2 ; pitch * 2 p2
- str r7, [r1, r6] ; o9 | o8
- add r6, r6, r2 ; pitch * 3 p3
- str r10, [r1, r6] ; o13 | o12
- str r9, [r1], #0x4 ; o1 | o0 ++
- bne loop1_dual_1 ;
- mov r5, #0x2 ; i=2 i
- sub r0, r1, #8 ; reset input/output i/o
-loop2_dual_2
- ldr r6, [r0, r2] ; i5 | i4 5|4
- ldr r1, [r0] ; i1 | i0 1|0
- ldr r12, [r0, #0x4] ; i3 | i2 3|2
- add r14, r2, #0x4 ; pitch + 2 p+2
- ldr r14, [r0, r14] ; i7 | i6 7|6
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
- pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
- pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
- pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1
- pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
- uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
- pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
- uadd16 r10, r11, r9 ; a a
- usub16 r9, r11, r9 ; b b
- pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
- subs r5, r5, #0x1 ; i-- --
- smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
- smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
- smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
- smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
-
- pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
- pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
- uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
- usub16 r12, r8, r6 ; c (o1 | o5) c
- uadd16 r6, r11, r1 ; d (o3 | o7) d
- uadd16 r7, r10, r6 ; a+d a+d
- mov r8, #0x4 ; set up 4's 4
- orr r8, r8, #0x40000 ; 4|4
- usub16 r6, r10, r6 ; a-d a-d
- uadd16 r6, r6, r8 ; a-d+4 3|7
- uadd16 r7, r7, r8 ; a+d+4 0|4
- uadd16 r10, r9, r12 ; b+c b+c
- usub16 r1, r9, r12 ; b-c b-c
- uadd16 r10, r10, r8 ; b+c+4 1|5
- uadd16 r1, r1, r8 ; b-c+4 2|6
- mov r8, r10, asr #19 ; o1 >> 3
- strh r8, [r0, #2] ; o1
- mov r8, r1, asr #19 ; o2 >> 3
- strh r8, [r0, #4] ; o2
- mov r8, r6, asr #19 ; o3 >> 3
- strh r8, [r0, #6] ; o3
- mov r8, r7, asr #19 ; o0 >> 3
- strh r8, [r0], r2 ; o0 +p
- sxth r10, r10 ;
- mov r8, r10, asr #3 ; o5 >> 3
- strh r8, [r0, #2] ; o5
- sxth r1, r1 ;
- mov r8, r1, asr #3 ; o6 >> 3
- strh r8, [r0, #4] ; o6
- sxth r6, r6 ;
- mov r8, r6, asr #3 ; o7 >> 3
- strh r8, [r0, #6] ; o7
- sxth r7, r7 ;
- mov r8, r7, asr #3 ; o4 >> 3
- strh r8, [r0], r2 ; o4 +p
-;;;;; subs r5, r5, #0x1 ; i-- --
- bne loop2_dual_2 ;
- ;
-
-;vpx_memset
- ldr r0, [sp]
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
-
- ENDP ;|vp8_dequant_idct_v6|
-
- END
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
index 31fc5d0..3a044f8 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -14,32 +14,32 @@
#if HAVE_ARMV6
extern prototype_dequant_block(vp8_dequantize_b_v6);
-extern prototype_dequant_idct(vp8_dequant_idct_v6);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
-#undef vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_v6
+#undef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
-#undef vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6
+#undef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
#endif
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
-extern prototype_dequant_idct(vp8_dequant_idct_neon);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
-#undef vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_neon
+#undef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
-#undef vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon
+#undef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
#endif
#endif
diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c
index 1504216..2ea03c4 100644
--- a/vp8/decoder/arm/dsystemdependent.c
+++ b/vp8/decoder/arm/dsystemdependent.c
@@ -23,8 +23,6 @@
pbi->mb.rtcd = &pbi->common.rtcd;
#if HAVE_ARMV7
pbi->dequant.block = vp8_dequantize_b_neon;
- pbi->dequant.idct = vp8_dequant_idct_neon;
- pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
@@ -32,8 +30,6 @@
#elif HAVE_ARMV6
pbi->dequant.block = vp8_dequantize_b_v6;
- pbi->dequant.idct = vp8_dequant_idct_v6;
- pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
similarity index 65%
rename from vp8/decoder/arm/neon/dequantdcidct_neon.asm
rename to vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
index ae126f2..ddb3240 100644
--- a/vp8/decoder/arm/neon/dequantdcidct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
@@ -9,31 +9,43 @@
;
- EXPORT |vp8_dequant_dc_idct_neon|
+ EXPORT |vp8_dequant_dc_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc);
+;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride,
+; int Dc);
; r0 short *input,
; r1 short *dq,
-; r2 short *output,
-; r3 int pitch,
-; (stack) int Dc
-|vp8_dequant_dc_idct_neon| PROC
+; r2 unsigned char *pred
+; r3 unsigned char *dest
+; sp int pitch
+; sp+4 int stride
+; sp+8 int Dc
+|vp8_dequant_dc_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
- ldr r1, [sp] ;load Dc from stack
+ ldr r1, [sp, #8] ;load Dc from stack
- ldr r12, _dcidct_coeff_
+ ldr r12, _CONSTANTS_
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
vmov.16 d2[0], r1
+ ldr r1, [sp] ; pitch
+ vld1.32 {d14[0]}, [r2], r1
+ vld1.32 {d14[1]}, [r2], r1
+ vld1.32 {d15[0]}, [r2], r1
+ vld1.32 {d15[1]}, [r2]
+
+ ldr r1, [sp, #4] ; stride
+
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
@@ -47,14 +59,9 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+ vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
- ;d6 - c1:temp1
- ;d7 - d1:temp2
- ;d8 - d1:temp1
- ;d9 - c1:temp2
-
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
@@ -83,7 +90,7 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+ vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
@@ -101,34 +108,29 @@
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
- add r1, r2, r3
- add r12, r1, r3
- add r0, r12, r3
-
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
- vst1.16 {d2}, [r2]
- vst1.16 {d3}, [r1]
- vst1.16 {d4}, [r12]
- vst1.16 {d5}, [r0]
+ vaddw.u8 q1, q1, d14
+ vaddw.u8 q2, q2, d15
+
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+
+ vst1.32 {d0[0]}, [r3], r1
+ vst1.32 {d0[1]}, [r3], r1
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r3]
bx lr
- ENDP
+ ENDP ; |vp8_dequant_dc_idct_add_neon|
-;-----------------
- AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_dcidct_coeff_
- DCD dcidct_coeff
-dcidct_coeff
- DCD 0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
+; Constant Pool
+_CONSTANTS_ DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
END
diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequant_idct_neon.asm
similarity index 67%
rename from vp8/decoder/arm/neon/dequantidct_neon.asm
rename to vp8/decoder/arm/neon/dequant_idct_neon.asm
index e0888ed..5c60dd6 100644
--- a/vp8/decoder/arm/neon/dequantidct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm
@@ -9,22 +9,33 @@
;
- EXPORT |vp8_dequant_idct_neon|
+ EXPORT |vp8_dequant_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch);
+;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride)
; r0 short *input,
; r1 short *dq,
-; r2 short *output,
-; r3 int pitch,
-|vp8_dequant_idct_neon| PROC
+; r2 unsigned char *pred
+; r3 unsigned char *dest
+; sp int pitch
+; sp+4 int stride
+
+|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
+ ldr r1, [sp] ; pitch
+ vld1.32 {d14[0]}, [r2], r1
+ vld1.32 {d14[1]}, [r2], r1
+ vld1.32 {d15[0]}, [r2], r1
+ vld1.32 {d15[1]}, [r2]
- ldr r12, _didct_coeff_
+ ldr r1, [sp, #4] ; stride
+
+ ldr r12, _CONSTANTS_
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
@@ -42,14 +53,9 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+ vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
- ;d6 - c1:temp1
- ;d7 - d1:temp2
- ;d8 - d1:temp1
- ;d9 - c1:temp2
-
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
@@ -78,7 +84,7 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+ vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
@@ -96,34 +102,29 @@
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
- add r1, r2, r3
- add r12, r1, r3
- add r0, r12, r3
-
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
- vst1.16 {d2}, [r2]
- vst1.16 {d3}, [r1]
- vst1.16 {d4}, [r12]
- vst1.16 {d5}, [r0]
+ vaddw.u8 q1, q1, d14
+ vaddw.u8 q2, q2, d15
+
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+
+ vst1.32 {d0[0]}, [r3], r1
+ vst1.32 {d0[1]}, [r3], r1
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r3]
bx lr
- ENDP
+ ENDP ; |vp8_dequant_idct_add_neon|
-;-----------------
- AREA didct4x4_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_didct_coeff_
- DCD didct_coeff
-didct_coeff
- DCD 0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
+; Constant Pool
+_CONSTANTS_ DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
END
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index a5850db..f4c6be9 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -126,7 +126,6 @@
}
}
-
static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
{
/* If the MV points so far into the UMV border that no visible pixels
@@ -182,110 +181,6 @@
}
-static void reconstruct_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
-{
- if (xd->frame_type == KEY_FRAME || xd->mbmi.ref_frame == INTRA_FRAME)
- {
- vp8_build_intra_predictors_mbuv(xd);
-
- if (xd->mbmi.mode != B_PRED)
- {
- vp8_build_intra_predictors_mby_ptr(xd);
- vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
- }
- else
- {
- vp8_recon_intra4x4mb(RTCD_VTABLE(recon), xd);
- }
- }
- else
- {
- vp8_build_inter_predictors_mb(xd);
- vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
- }
-}
-
-
-static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
-{
- int i;
- BLOCKD *b = &xd->block[24];
-
-
- if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
- {
- DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
- // do 2nd order transform on the dc block
- if (b->eob > 1)
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- }
-
-
- for (i = 0; i < 16; i++)
- {
-
- b = &xd->block[i];
-
- if (b->eob > 1)
- {
- DEQUANT_INVOKE(&pbi->dequant, idct_dc)(b->qcoeff, &b->dequant[0][0], b->diff, 32, xd->block[24].diff[i]);
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(xd->block[24].diff[i], b->diff, 32);
- }
- }
-
- for (i = 16; i < 24; i++)
- {
- b = &xd->block[i];
-
- if (b->eob > 1)
- {
- DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, 16);
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, 16);
- ((int *)b->qcoeff)[0] = 0;
- }
- }
- }
- else
- {
- for (i = 0; i < 24; i++)
- {
-
- b = &xd->block[i];
-
- if (b->eob > 1)
- {
- DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, (32 - (i & 16)));
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, (32 - (i & 16)));
- ((int *)b->qcoeff)[0] = 0;
- }
- }
- }
-}
-
void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int eobtotal = 0;
@@ -321,27 +216,124 @@
{
xd->mode_info_context->mbmi.dc_diff = 0;
skip_recon_mb(pbi, xd);
+ return;
+ }
+
+ if (xd->segmentation_enabled)
+ mb_init_dequantizer(pbi, xd);
+
+ // do prediction
+ if (xd->frame_type == KEY_FRAME || xd->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv(xd);
+
+ if (xd->mbmi.mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_ptr(xd);
+ } else {
+ vp8_intra_prediction_down_copy(xd);
+ }
}
else
{
- if (xd->segmentation_enabled)
- mb_init_dequantizer(pbi, xd);
-
- de_quantand_idct(pbi, xd);
- reconstruct_mb(pbi, xd);
+ vp8_build_inter_predictors_mb(xd);
}
-
- /* Restore the original MV so as not to affect the entropy context. */
- if (do_clamp)
+ // dequantization and idct
+ if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
{
- if (xd->mbmi.mode == SPLITMV)
- for (i=0; i<24; i++)
- xd->block[i].bmi.mv.as_mv = orig_mvs[i];
+ BLOCKD *b = &xd->block[24];
+ DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+ // do 2nd order transform on the dc block
+ if (b->eob > 1)
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
else
{
- xd->mbmi.mv.as_mv = orig_mvs[0];
- xd->block[16].bmi.mv.as_mv = orig_mvs[1];
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+
+ for (i = 0; i < 16; i++)
+ {
+
+ b = &xd->block[i];
+
+ if (b->eob > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, dc_idct_add)
+ (b->qcoeff, &b->dequant[0][0], b->predictor,
+ *(b->base_dst) + b->dst, 16, b->dst_stride,
+ xd->block[24].diff[i]);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(xd->block[24].diff[i], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ }
+ }
+ else if ((xd->frame_type == KEY_FRAME || xd->mbmi.ref_frame == INTRA_FRAME) && xd->mbmi.mode == B_PRED)
+ {
+ for (i = 0; i < 16; i++)
+ {
+
+ BLOCKD *b = &xd->block[i];
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+
+ if (b->eob > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+
+ }
+ else
+ {
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+
+ if (b->eob > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+
+ for (i = 16; i < 24; i++)
+ {
+
+ BLOCKD *b = &xd->block[i];
+
+ if (b->eob > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
}
}
}
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index b023d28..df7cf5f 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -32,8 +32,12 @@
}
}
-void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride)
{
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
int i;
for (i = 0; i < 16; i++)
@@ -41,13 +45,40 @@
input[i] = dq[i] * input[i];
}
- vp8_short_idct4x4llm_c(input, output, pitch);
+ // the idct halves ( >> 1) the pitch
+ vp8_short_idct4x4llm_c(input, output, 4 << 1);
+
vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
}
-void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc)
+void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride,
+ int Dc)
{
int i;
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
input[0] = (short)Dc;
@@ -56,6 +87,28 @@
input[i] = dq[i] * input[i];
}
- vp8_short_idct4x4llm_c(input, output, pitch);
+ // the idct halves ( >> 1) the pitch
+ vp8_short_idct4x4llm_c(input, output, 4 << 1);
+
vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
}
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index a02ea7e..fbca391 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -16,11 +16,16 @@
#define prototype_dequant_block(sym) \
void sym(BLOCKD *x)
-#define prototype_dequant_idct(sym) \
- void sym(short *input, short *dq, short *output, int pitch)
+#define prototype_dequant_idct_add(sym) \
+ void sym(short *input, short *dq, \
+ unsigned char *pred, unsigned char *output, \
+ int pitch, int stride)
-#define prototype_dequant_idct_dc(sym) \
- void sym(short *input, short *dq, short *output, int pitch, int dc)
+#define prototype_dequant_dc_idct_add(sym) \
+ void sym(short *input, short *dq, \
+ unsigned char *pred, unsigned char *output, \
+ int pitch, int stride, \
+ int dc)
#if ARCH_X86 || ARCH_X86_64
#include "x86/dequantize_x86.h"
@@ -35,25 +40,26 @@
#endif
extern prototype_dequant_block(vp8_dequant_block);
-#ifndef vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_c
+#ifndef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
#endif
-extern prototype_dequant_idct(vp8_dequant_idct);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add);
-#ifndef vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_c
+#ifndef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
#endif
-extern prototype_dequant_idct_dc(vp8_dequant_idct_dc);
-
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
-typedef prototype_dequant_idct((*vp8_dequant_idct_fn_t));
-typedef prototype_dequant_idct_dc((*vp8_dequant_idct_dc_fn_t));
+
+typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
+typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
+
typedef struct
{
- vp8_dequant_block_fn_t block;
- vp8_dequant_idct_fn_t idct;
- vp8_dequant_idct_dc_fn_t idct_dc;
+ vp8_dequant_block_fn_t block;
+ vp8_dequant_idct_add_fn_t idct_add;
+ vp8_dequant_dc_idct_add_fn_t dc_idct_add;
} vp8_dequant_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 272d422..ab085e23 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -21,8 +21,8 @@
#if CONFIG_RUNTIME_CPU_DETECT
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
- pbi->dequant.idct = vp8_dequant_idct_c;
- pbi->dequant.idct_dc = vp8_dequant_dc_idct_c;
+ pbi->dequant.idct_add = vp8_dequant_idct_add_c;
+ pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
#if 0 //For use with RTCD, when implemented
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 752081e..8f4e9da 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -12,6 +12,9 @@
#ifndef WIN32
# include <unistd.h>
#endif
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "threading.h"
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 2dcf548..f11eef3 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -50,12 +50,12 @@
ret
-;void dequant_idct_mmx(short *input, short *dq, short *output, int pitch)
-global sym(vp8_dequant_idct_mmx)
-sym(vp8_dequant_idct_mmx):
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+global sym(vp8_dequant_idct_add_mmx)
+sym(vp8_dequant_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
@@ -77,7 +77,8 @@
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
- mov rdx, arg(2) ;output
+ mov rdx, arg(3) ;dest
+ mov rsi, arg(2) ;pred
pxor mm7, mm7
@@ -88,7 +89,8 @@
movq [rax+24],mm7
- movsxd rax, dword ptr arg(3) ;pitch
+ movsxd rax, dword ptr arg(4) ;pitch
+ movsxd rdi, dword ptr arg(5) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@@ -207,13 +209,34 @@
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
- movq [rdx], mm0
+ pxor mm7, mm7
- movq [rdx+rax], mm1
- movq [rdx+rax*2], mm2
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
- add rdx, rax
- movq [rdx+rax*2], mm5
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
@@ -224,12 +247,12 @@
ret
-;void dequant_dc_idct_mmx(short *input, short *dq, short *output, int pitch, int Dc)
-global sym(vp8_dequant_dc_idct_mmx)
-sym(vp8_dequant_dc_idct_mmx):
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+global sym(vp8_dequant_dc_idct_add_mmx)
+sym(vp8_dequant_dc_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
+ SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
@@ -238,8 +261,6 @@
mov rax, arg(0) ;input
mov rdx, arg(1) ;dq
- movsxd rcx, dword ptr arg(4) ;Dc
-
movq mm0, [rax ]
pmullw mm0, [rdx]
@@ -252,7 +273,8 @@
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
- mov rdx, arg(2) ;output
+ mov rdx, arg(3) ;dest
+ mov rsi, arg(2) ;pred
pxor mm7, mm7
@@ -262,8 +284,15 @@
movq [rax+16],mm7
movq [rax+24],mm7
- pinsrw mm0, rcx, 0
- movsxd rax, dword ptr arg(3) ;pitch
+ ; move lower word of Dc to lower word of mm0
+ psrlq mm0, 16
+ movzx rcx, word ptr arg(6) ;Dc
+ psllq mm0, 16
+ movd mm7, rcx
+ por mm0, mm7
+
+ movsxd rax, dword ptr arg(4) ;pitch
+ movsxd rdi, dword ptr arg(5) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@@ -382,13 +411,34 @@
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
- movq [rdx], mm0
+ pxor mm7, mm7
- movq [rdx+rax], mm1
- movq [rdx+rax*2], mm2
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
- add rdx, rax
- movq [rdx+rax*2], mm5
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h
index 743a8f5..4492676 100644
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -21,19 +21,19 @@
*/
#if HAVE_MMX
extern prototype_dequant_block(vp8_dequantize_b_mmx);
-extern prototype_dequant_idct(vp8_dequant_idct_mmx);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_mmx);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_mmx
-#undef vp8_dequant_idct
-#define vp8_dequant_idct vp8_dequant_idct_mmx
+#undef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-#undef vp8_dequant_idct_dc
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_mmx
+#undef vp8_dequant_dc_idct_add
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
#endif
#endif
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index 957fb1d..7891051 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -43,8 +43,8 @@
if (flags & HAS_MMX)
{
pbi->dequant.block = vp8_dequantize_b_mmx;
- pbi->dequant.idct = vp8_dequant_idct_mmx;
- pbi->dequant.idct_dc = vp8_dequant_dc_idct_mmx;
+ pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
+ pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
}
#endif
diff --git a/vp8/encoder/arm/csystemdependent.c b/vp8/encoder/arm/csystemdependent.c
index bfceab1..698cf1e 100644
--- a/vp8/encoder/arm/csystemdependent.c
+++ b/vp8/encoder/arm/csystemdependent.c
@@ -63,6 +63,10 @@
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+ /* The neon quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ */
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;*/
#elif HAVE_ARMV6
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
index 5a7b0ca..339b8a2 100644
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -15,8 +15,11 @@
#if HAVE_ARMV7
extern prototype_quantize_block(vp8_fast_quantize_b_neon);
-#undef vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+/* The neon quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ */
+//#undef vp8_quantize_fastquantb
+//#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
#endif
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index cb9a8dd..4e4483e 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -102,7 +102,8 @@
80, 80, 80, 80, 80, 80, 80, 80,
80,
};
-
+//#define EXACT_QUANT
+#ifdef EXACT_QUANT
static void vp8cx_invert_quant(short *quant, short *shift, short d)
{
unsigned t;
@@ -184,7 +185,71 @@
}
}
}
-
+#else
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+ int r, c;
+ int i;
+ int quant_val;
+ int Q;
+
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ // dc values
+ quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+ cpi->Y1quant[Q][0][0] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+ cpi->Y2quant[Q][0][0] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ cpi->UVquant[Q][0][0] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
+ cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ // all the ac values = ;
+ for (i = 1; i < 16; i++)
+ {
+ int rc = vp8_default_zig_zag1d[i];
+ r = (rc >> 2);
+ c = (rc & 3);
+
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant[Q][r][c] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][r][c] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant[Q][r][c] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][r][c] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant[Q][r][c] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][r][c] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+ }
+ }
+}
+#endif
void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
{
int i;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 877002b..7b44724 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -16,6 +16,8 @@
#include "entropy.h"
#include "predictdc.h"
+//#define EXACT_QUANT
+#ifdef EXACT_QUANT
void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
{
int i, rc, eob;
@@ -116,7 +118,103 @@
d->eob = eob + 1;
}
-
+#else
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ zbin = zbin_ptr[rc] ;
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin)
+ {
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y)
+ {
+ eob = i; // last nonzero coeffs
+ }
+ }
+ }
+ d->eob = eob + 1;
+}
+
+void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ //if ( i == 0 )
+ // zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;
+ //else
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+ zbin_boost_ptr ++;
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin)
+ {
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y)
+ {
+ eob = i; // last nonzero coeffs
+ zbin_boost_ptr = &b->zrun_zbin_boost[0]; // reset zero runlength
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
+
+#endif
void vp8_quantize_mby(MACROBLOCK *x)
{
int i;
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
index bf12fee..4bb6b60 100644
--- a/vp8/encoder/x86/csystemdependent.c
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -180,7 +180,10 @@
{
// Willamette instruction set available:
vp8_mbuverror = vp8_mbuverror_xmm;
- vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+ /* The sse quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ */
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
#if 0 //new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
@@ -224,7 +227,10 @@
{
// MMX instruction set available:
vp8_mbuverror = vp8_mbuverror_mmx;
- vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+ /* The mmx quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ */
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
#if 0 // new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index c64a8ba..faaff64 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -42,8 +42,8 @@
sub rsp, vp8_regularquantizeb_stack_size
- movdqa DQWORD PTR[rsp + save_xmm6], xmm6
- movdqa DQWORD PTR[rsp + save_xmm7], xmm7
+ movdqa OWORD PTR[rsp + save_xmm6], xmm6
+ movdqa OWORD PTR[rsp + save_xmm7], xmm7
mov rdx, arg(0) ;coeff_ptr
mov eax, arg(8) ;zbin_oq_value
@@ -51,8 +51,8 @@
mov rcx, arg(1) ;zbin_ptr
movd xmm7, eax
- movdqa xmm0, DQWORD PTR[rdx]
- movdqa xmm4, DQWORD PTR[rdx + 16]
+ movdqa xmm0, OWORD PTR[rdx]
+ movdqa xmm4, OWORD PTR[rdx + 16]
movdqa xmm1, xmm0
movdqa xmm5, xmm4
@@ -63,8 +63,8 @@
pxor xmm1, xmm0
pxor xmm5, xmm4
- movdqa xmm2, DQWORD PTR[rcx] ;load zbin_ptr
- movdqa xmm3, DQWORD PTR[rcx + 16] ;load zbin_ptr
+ movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr
+ movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr
pshuflw xmm7, xmm7, 0
psubw xmm1, xmm0 ;x = abs(z)
@@ -81,17 +81,17 @@
mov rdi, arg(5) ;round_ptr
mov rsi, arg(6) ;quant_ptr
- movdqa DQWORD PTR[rsp + abs_minus_zbin_lo], xmm1
- movdqa DQWORD PTR[rsp + abs_minus_zbin_hi], xmm5
+ movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
+ movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5
paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back
paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back
- movdqa xmm2, DQWORD PTR[rdi]
- movdqa xmm3, DQWORD PTR[rsi]
+ movdqa xmm2, OWORD PTR[rdi]
+ movdqa xmm3, OWORD PTR[rsi]
- movdqa xmm6, DQWORD PTR[rdi + 16]
- movdqa xmm7, DQWORD PTR[rsi + 16]
+ movdqa xmm6, OWORD PTR[rdi + 16]
+ movdqa xmm7, OWORD PTR[rsi + 16]
paddw xmm1, xmm2
paddw xmm5, xmm6
@@ -108,11 +108,11 @@
psubw xmm1, xmm0
psubw xmm5, xmm4
- movdqa DQWORD PTR[rsp + temp_qcoeff_lo], xmm1
- movdqa DQWORD PTR[rsp + temp_qcoeff_hi], xmm5
+ movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1
+ movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5
- movdqa DQWORD PTR[rsi], xmm6 ;zero qcoeff
- movdqa DQWORD PTR[rsi + 16], xmm6 ;zero qcoeff
+ movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff
+ movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff
xor rax, rax
mov rcx, -1
@@ -223,22 +223,22 @@
mov rcx, arg(3) ;dequant_ptr
mov rsi, arg(7) ;dqcoeff_ptr
- movdqa xmm2, DQWORD PTR[rdi]
- movdqa xmm3, DQWORD PTR[rdi + 16]
+ movdqa xmm2, OWORD PTR[rdi]
+ movdqa xmm3, OWORD PTR[rdi + 16]
- movdqa xmm0, DQWORD PTR[rcx]
- movdqa xmm1, DQWORD PTR[rcx + 16]
+ movdqa xmm0, OWORD PTR[rcx]
+ movdqa xmm1, OWORD PTR[rcx + 16]
pmullw xmm0, xmm2
pmullw xmm1, xmm3
- movdqa DQWORD PTR[rsi], xmm0 ;store dqcoeff
- movdqa DQWORD PTR[rsi + 16], xmm1 ;store dqcoeff
+ movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff
+ movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff
mov rax, [rsp + eob]
- movdqa xmm6, DQWORD PTR[rsp + save_xmm6]
- movdqa xmm7, DQWORD PTR[rsp + save_xmm7]
+ movdqa xmm6, OWORD PTR[rsp + save_xmm6]
+ movdqa xmm7, OWORD PTR[rsp + save_xmm7]
add rax, 1
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
index 37d69a8..31a43e4 100644
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -27,8 +27,11 @@
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_quantize_quantb
-#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+/* The sse2 quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ *#undef vp8_quantize_quantb
+ *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+ */
#endif
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index 5775d98..c381ffd 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -15,7 +15,7 @@
; unsigned short *diff, unsigned char *Predictor,
; int pitch);
global sym(vp8_subtract_b_mmx_impl)
-sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -151,7 +151,7 @@
;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
global sym(vp8_subtract_mbuv_mmx)
-sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index d993927..5b8a301 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -125,6 +125,7 @@
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/vpx_asm_offsets.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/filter_c.c
+VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/idctllm.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/recon.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/reconintra4x4.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/generic/systemdependent.c
@@ -134,6 +135,7 @@
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
@@ -150,6 +152,7 @@
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index c88df47..c0a58ae 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -87,6 +87,7 @@
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/dct_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index e741680..e9674ca 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -23,12 +23,12 @@
#File list for armv6
# decoder
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantdcidct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantidct_v6$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
#File list for neon
# decoder
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantdcidct_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantidct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index 5748c23..eff9926 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -138,16 +138,16 @@
%endmacro
%endif
%endif
- %define HIDDEN_DATA
+ %define HIDDEN_DATA(x) x
%else
%macro GET_GOT 1
%endmacro
%define GLOBAL wrt rip
%ifidn __OUTPUT_FORMAT__,elf64
%define WRT_PLT wrt ..plt
- %define HIDDEN_DATA :data hidden
+ %define HIDDEN_DATA(x) x:data hidden
%else
- %define HIDDEN_DATA
+ %define HIDDEN_DATA(x) x
%endif
%endif
%ifnmacro GET_GOT