framework for assembly version of the detokenizer
adds a compile time option: --enable-arm-asm-detok which pulls in
vp8/decoder/arm/detokenize.asm
currently about break even speed wise, but changes are pending to
the fill code (branch and load 3 bytes versus conditionally always
load one) and the error handling. Currently it doesn't handle zero
runs or overrunning the buffer.
this is really just so i don't have to rebase my changes all the
time to run benchmarks - now just need to replace one file!
Change-Id: I56d0e2354dc0ca3811bffd0e88fe1f952fa6c797
diff --git a/configure b/configure
index 5c908d4..ac3d162 100755
--- a/configure
+++ b/configure
@@ -38,6 +38,7 @@
${toggle_realtime_only} enable this option while building for real-time encoding
${toggle_runtime_cpu_detect} runtime cpu detection
${toggle_shared} shared library support
+ ${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only)
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -242,6 +243,7 @@
spatial_resampling
realtime_only
shared
+ arm_asm_detok
"
CMDLINE_SELECT="
extra_warnings
@@ -278,6 +280,7 @@
spatial_resampling
realtime_only
shared
+ arm_asm_detok
"
process_cmdline() {
diff --git a/vp8/decoder/arm/detokenize.asm b/vp8/decoder/arm/detokenize.asm
new file mode 100644
index 0000000..bafacb9
--- /dev/null
+++ b/vp8/decoder/arm/detokenize.asm
@@ -0,0 +1,333 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_decode_mb_tokens_v6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ INCLUDE vpx_asm_offsets.asm
+
+l_qcoeff EQU 0
+l_i EQU 4
+l_type EQU 8
+l_stop EQU 12
+l_c EQU 16
+l_l_ptr EQU 20
+l_a_ptr EQU 24
+l_bc EQU 28
+l_coef_ptr EQU 32
+l_stacksize EQU 64
+
+
+;; constant offsets -- these should be created at build time
+c_onyxblock2left_offset EQU 25
+c_onyxblock2above_offset EQU 50
+c_entropy_nodes EQU 11
+c_dct_eob_token EQU 11
+
+|vp8_decode_mb_tokens_v6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ sub sp, sp, #l_stacksize
+ mov r7, r1 ; type
+ mov r9, r0 ; detoken
+
+ ldr r1, [r9, #detok_current_bc]
+ ldr r0, [r9, #detok_qcoeff_start_ptr]
+ mov r11, #0 ; i
+ mov r3, #0x10 ; stop
+
+ cmp r7, #1 ; type ?= 1
+ addeq r11, r11, #24 ; i = 24
+ addeq r3, r3, #8 ; stop = 24
+ addeq r0, r0, #3, 24 ; qcoefptr += 24*16 ?CHECKME
+
+ str r0, [sp, #l_qcoeff]
+ str r11, [sp, #l_i]
+ str r7, [sp, #l_type]
+ str r3, [sp, #l_stop]
+ str r1, [sp, #l_bc]
+
+ add lr, r9, r7, lsl #2 ; detoken + type*4
+
+ ldr r8, [r1, #bool_decoder_user_buffer]
+
+ ldr r10, [lr, #detok_coef_probs] ; coef_probs[type]
+ ldr r5, [r1, #bool_decoder_count]
+ ldr r6, [r1, #bool_decoder_range]
+ ldr r4, [r1, #bool_decoder_value]
+
+ str r10, [sp, #l_coef_ptr]
+
+ ;align 4
+BLOCK_LOOP
+ ldr r3, [r9, #detok_ptr_onyxblock2context_leftabove]
+ ldr r2, [r9, #detok_A]
+ ldr r1, [r9, #detok_L]
+ ldrb r12, [r3, r11]! ; onyxblock2context[i]
+
+ cmp r7, #0 ; c = !type
+ moveq r7, #1
+ movne r7, #0
+
+ ldr r0, [r2, r12, lsl #2] ; A[onyxblock2context[i]]
+ add r1, r1, r12, lsl #4 ; L + onyxblock2context[i] << 4
+ ; A is ptr to ptr (**)
+ ; L is ptr to data (*[4])
+
+ ldrb r2, [r3, #c_onyxblock2above_offset] ; + above offset
+ ldrb r3, [r3, #c_onyxblock2left_offset] ; + left offset
+ mov lr, #c_entropy_nodes ; ENTROPY_NODES = 11
+;; ;++
+
+ ldr r2, [r0, r2, lsl #2]! ; A + above offset
+ ldr r3, [r1, r3, lsl #2]! ; L + left offset
+; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0)
+ cmp r2, #0 ; *a ?= 0
+ movne r2, #1 ; haha if a == 0 no need to set up another var to state that pretty sweet :)
+ cmp r3, #0 ; *l ?= 0
+ addne r2, r2, #1 ; t
+
+ str r1, [sp, #l_l_ptr] ; save &l
+ str r0, [sp, #l_a_ptr] ; save &a
+ smlabb r0, r2, lr, r10 ; Prob = coef_probs + (t * ENTROPY_NODES)
+ mov r1, #0 ; t = 0
+ str r7, [sp, #l_c]
+
+ ;align 4
+COEFF_LOOP
+ ldr r3, [r9, #detok_ptr_onyx_coef_bands_x]
+ ldr lr, [r9, #detok_onyx_coef_tree_ptr]
+
+ ; onyx_coef_bands_x is UINT16
+ add r3, r3, r7, lsl #1 ; coef_bands_x[c]
+ ldrh r3, [r3] ; UINT16
+
+ ;++
+ add r0, r0, r3 ; Prob += coef_bands_x[c]
+
+ ;align 4
+get_token_loop
+ ldrb r2, [r0, +r1, asr #1] ; Prob[t >> 1]
+ mov r3, r6, lsl #8 ; range << 8
+ sub r3, r3, #256 ; (range << 8) - (1 << 8)
+ mov r10, #1 ; 1
+
+ smlawb r2, r3, r2, r10 ; split = 1 + (((range-1) * probability) >> 8)
+
+ ldrb r12, [r8] ; load cx data byte in stall slot : r8 = bufptr
+ ;++
+
+ subs r3, r4, r2, lsl #24 ; value-(split<<24): used later to calculate shift for NORMALIZE
+ addhs r1, r1, #1 ; t += 1
+ movhs r4, r3 ; value -= bigsplit (split << 24)
+ subhs r2, r6, r2 ; range -= split
+ ; movlo r6, r2 ; range = split
+
+ ldrsb r1, [lr, r1] ; t = onyx_coef_tree_ptr[t]
+
+; NORMALIZE
+ clz r3, r2 ; vp8dx_bitreader_norm[range] + 24
+ sub r3, r3, #24 ; vp8dx_bitreader_norm[range]
+ subs r5, r5, r3 ; count -= shift
+ mov r6, r2, lsl r3 ; range <<= shift
+ mov r4, r4, lsl r3 ; value <<= shift
+
+; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
+ addle r5, r5, #8 ; count += 8
+ rsble r3, r5, #24 ; 24 - count
+ addle r8, r8, #1 ; bufptr++
+ orrle r4, r4, r12, lsl r3 ; value |= *bufptr << shift + 16
+
+ cmp r1, #0 ; t ?= 0
+ bgt get_token_loop ; while (t > 0)
+
+ cmn r1, #c_dct_eob_token ; if(t == -DCT_EOB_TOKEN)
+ beq END_OF_BLOCK ; break
+
+ rsb lr, r1, #0 ; v = -t;
+
+ cmp lr, #4 ; if(v > FOUR_TOKEN)
+ ble SKIP_EXTRABITS
+
+ ldr r3, [r9, #detok_teb_base_ptr]
+ mov r11, #1 ; 1 in split = 1 + ... nope, v+= 1 << bits_count
+ add r7, r3, lr, lsl #4 ; detok_teb_base_ptr + (v << 4)
+
+ ldrsh lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
+ ldrsh r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
+
+extrabits_loop
+ add r3, r0, r7 ; &teb_ptr->Probs[bits_count]
+
+ ldrb r2, [r3, #4] ; probability. why +4?
+ mov r3, r6, lsl #8 ; range << 8
+ sub r3, r3, #256 ; range << 8 + 1 << 8
+
+ smlawb r2, r3, r2, r11 ; split = 1 + (((range-1) * probability) >> 8)
+
+ ldrb r12, [r8] ; *bufptr
+ ;++
+
+ subs r10, r4, r2, lsl #24 ; value - (split<<24)
+ movhs r4, r10 ; value = value - (split << 24)
+ subhs r2, r6, r2 ; range = range - split
+ addhs lr, lr, r11, lsl r0 ; v += ((UINT16)1<<bits_count)
+
+; NORMALIZE
+ clz r3, r2 ; shift - leading zeros in split
+ sub r3, r3, #24 ; don't count first 3 bytes
+ subs r5, r5, r3 ; count -= shift
+ mov r6, r2, lsl r3 ; range = range << shift
+ mov r4, r4, lsl r3 ; value <<= shift
+
+ addle r5, r5, #8 ; count += BR_COUNT
+ addle r8, r8, #1 ; bufptr++
+ rsble r3, r5, #24 ; BR_COUNT - count
+ orrle r4, r4, r12, lsl r3 ; value |= *bufptr << (BR_COUNT - count)
+
+ subs r0, r0, #1 ; bits_count --
+ bpl extrabits_loop
+
+
+SKIP_EXTRABITS
+ ldr r11, [sp, #l_qcoeff]
+ ldr r0, [sp, #l_coef_ptr] ; Prob = coef_probs
+
+ cmp r1, #0 ; check for nonzero token - if (t)
+ beq SKIP_EOB_CHECK ; if t is zero, we will skip the eob table chec
+
+ add r3, r6, #1 ; range + 1
+ mov r2, r3, lsr #1 ; split = (range + 1) >> 1
+
+ subs r3, r4, r2, lsl #24 ; value - (split<<24)
+ movhs r4, r3 ; value -= (split << 24)
+ subhs r2, r6, r2 ; range -= split
+ mvnhs r3, lr ; -v
+ addhs lr, r3, #1 ; v = (v ^ -1) + 1
+
+; NORMALIZE
+ clz r3, r2 ; leading 0s in split
+ sub r3, r3, #24 ; shift
+ subs r5, r5, r3 ; count -= shift
+ mov r6, r2, lsl r3 ; range <<= shift
+ mov r4, r4, lsl r3 ; value <<= shift
+ ldrleb r2, [r8], #1 ; *(bufptr++)
+ addle r5, r5, #8 ; count += 8
+ rsble r3, r5, #24 ; BR_COUNT - count
+ orrle r4, r4, r2, lsl r3 ; value |= *bufptr << (BR_COUNT - count)
+
+ add r0, r0, #0xB ; Prob += ENTROPY_NODES (11)
+
+ cmn r1, #1 ; t < -ONE_TOKEN
+
+ addlt r0, r0, #0xB ; Prob += ENTROPY_NODES (11)
+
+ mvn r1, #1 ; t = -1 ???? C is -2
+
+SKIP_EOB_CHECK
+ ldr r7, [sp, #l_c] ; c
+ ldr r3, [r9, #detok_scan]
+ add r1, r1, #2 ; t+= 2
+ cmp r7, #(0x10 - 1) ; c should will be one higher
+
+ ldr r3, [r3, +r7, lsl #2] ; scan[c] this needs pre-inc c value
+ add r7, r7, #1 ; c++
+ add r3, r11, r3, lsl #1 ; qcoeff + scan[c]
+
+ str r7, [sp, #l_c] ; store c
+ strh lr, [r3] ; qcoef_ptr[scan[c]] = v
+
+ blt COEFF_LOOP
+
+ sub r7, r7, #1 ; if(t != -DCT_EOB_TOKEN) --c ; never stored! no condition!
+
+END_OF_BLOCK
+ ldr r3, [sp, #l_type] ; type
+ ldr r10, [sp, #l_coef_ptr] ; coef_ptr
+ ldr r0, [sp, #l_qcoeff] ; qcoeff
+ ldr r11, [sp, #l_i] ; i
+ ldr r12, [sp, #l_stop] ; stop
+
+ cmp r3, #0 ; type ?= 0
+ moveq r1, #1
+ movne r1, #0
+ add r3, r11, r9 ; detok + i
+
+ cmp r7, r1 ; c ?= !type
+ strb r7, [r3, #detok_eob] ; eob[i] = c
+
+ ldr r7, [sp, #l_l_ptr] ; l
+ ldr r2, [sp, #l_a_ptr] ; a
+ movne r3, #1 ; t
+ moveq r3, #0
+
+ add r0, r0, #0x20 ; qcoeff += 32 (16 * 2?)
+ add r11, r11, #1 ; i++
+ str r3, [r7] ; *l = t
+ str r3, [r2] ; *a = t
+ str r0, [sp, #l_qcoeff] ; qcoeff
+ str r11, [sp, #l_i] ; i
+
+ cmp r11, r12 ; i >= stop ? VERIFY should be strictly LT(<)?
+ ldr r7, [sp, #l_type] ; type
+ mov lr, #0xB ; 11 (ENTORPY_NODES?)
+
+ blt BLOCK_LOOP
+
+ cmp r11, #0x19 ; i ?= 25
+ bne ln2_decode_mb_to
+
+ ldr r12, [r9, #detok_qcoeff_start_ptr]
+ ldr r10, [r9, #detok_coef_probs]
+ mov r7, #0 ; type/i = 0
+ mov r3, #0x10 ; stop = 0
+ str r12, [sp, #l_qcoeff] ; qcoeff_ptr = qcoeff_start_ptr
+ str r7, [sp, #l_i]
+ str r7, [sp, #l_type]
+ str r3, [sp, #l_stop]
+
+ str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type] (0)
+
+ b BLOCK_LOOP
+
+ln2_decode_mb_to
+ cmp r11, #0x10 ; i ?= 16
+ bne ln1_decode_mb_to
+
+ mov r10, #detok_coef_probs
+ add r10, r10, #2*4 ; coef_probs[type]
+ ldr r10, [r9, r10] ; detok + 48 - THIS IS PROBABLY THE ISSUE: NEW STRUCTURE
+
+ mov r7, #2 ; type = 2
+ mov r3, #0x18 ; stop = 24
+
+ str r7, [sp, #l_type]
+ str r3, [sp, #l_stop]
+
+ str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type] - didn't want to add 2 to coef_probs
+ b BLOCK_LOOP
+
+ln1_decode_mb_to
+ ldr r2, [sp, #l_bc]
+ mov r0, #0
+ nop
+
+ str r8, [r2, #bool_decoder_user_buffer]
+ str r5, [r2, #bool_decoder_count]
+ str r4, [r2, #bool_decoder_value]
+ str r6, [r2, #bool_decoder_range]
+
+ add sp, sp, #l_stacksize
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp8_decode_mb_tokens_v6|
+
+ END
diff --git a/vp8/decoder/arm/detokenize_arm.h b/vp8/decoder/arm/detokenize_arm.h
new file mode 100644
index 0000000..1c53f7b
--- /dev/null
+++ b/vp8/decoder/arm/detokenize_arm.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_ARM_H
+#define DETOKENIZE_ARM_H
+
+#if HAVE_ARMV6
+#if CONFIG_ARM_ASM_DETOK
+void vp8_init_detokenizer(VP8D_COMP *dx);
+void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
+#endif
+#endif
+
+#endif
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 7407417..34faae3 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -14,6 +14,7 @@
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
+#include "detokenize.h"
#define BOOL_DATA UINT8
@@ -103,6 +104,34 @@
*l = 0;
}
}
+
+#if CONFIG_ARM_ASM_DETOK
+DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above
+};
+
+void vp8_init_detokenizer(VP8D_COMP *dx)
+{
+ const VP8_COMMON *const oc = & dx->common;
+ MACROBLOCKD *x = & dx->mb;
+
+ dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
+ dx->detoken.ptr_onyxblock2context_leftabove = vp8_block2context_leftabove;
+ dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x;
+ dx->detoken.scan = vp8_default_zig_zag1d;
+ dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
+ dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
+
+ dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]);
+ dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]);
+ dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]);
+ dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]);
+}
+#endif
+
DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
#define FILL \
if(count < 0) \
@@ -200,6 +229,35 @@
}\
NORMALIZE
+#if CONFIG_ARM_ASM_DETOK
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+ int eobtotal = 0;
+ int i, type;
+
+ dx->detoken.current_bc = x->current_bc;
+ dx->detoken.A = x->above_context;
+ dx->detoken.L = x->left_context;
+
+ type = 3;
+
+ if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+ {
+ type = 1;
+ eobtotal -= 16;
+ }
+
+ vp8_decode_mb_tokens_v6(&dx->detoken, type);
+
+ for (i = 0; i < 25; i++)
+ {
+ x->block[i].eob = dx->detoken.eob[i];
+ eobtotal += dx->detoken.eob[i];
+ }
+
+ return eobtotal;
+}
+#else
int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
{
ENTROPY_CONTEXT **const A = x->above_context;
@@ -395,3 +453,4 @@
return eobtotal;
}
+#endif //!CONFIG_ASM_DETOK
diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h
index 2f6b4a9..aa98dea 100644
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -9,12 +9,16 @@
*/
-#ifndef detokenize_h
-#define detokenize_h 1
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
#include "onyxd_int.h"
+#if ARCH_ARM
+#include "arm/detokenize_arm.h"
+#endif
+
void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
-#endif /* detokenize_h */
+#endif /* DETOKENIZE_H */
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 728d5ca..5a88ba0 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -29,13 +29,11 @@
#include "vpx_scale/vpxscale.h"
#include "systemdependent.h"
#include "vpx_ports/vpx_timer.h"
-
+#include "detokenize.h"
extern void vp8_init_loop_filter(VP8_COMMON *cm);
-
extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
-// DEBUG code
#if CONFIG_DEBUG
void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
{
@@ -129,6 +127,9 @@
cm->last_sharpness_level = cm->sharpness_level;
}
+#if CONFIG_ARM_ASM_DETOK
+ vp8_init_detokenizer(pbi);
+#endif
pbi->common.error.setjmp = 0;
return (VP8D_PTR) pbi;
}
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index e9674ca..d40f76e 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -11,24 +11,17 @@
#VP8_DX_SRCS list is modified according to different platforms.
-#File list for arm
-# decoder
-#VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/decodframe_arm.c
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dequantize_arm.c
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dsystemdependent.c
-
-#VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/decodframe.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/dequantize.c
VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c
+VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK) += decoder/arm/detokenize$(ASM)
#File list for armv6
-# decoder
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
#File list for neon
-# decoder
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)