Merge "mips msa vp8 loop filter optimization"
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 6294af1..4ee4ad4 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -494,6 +494,14 @@
make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
#endif
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+ !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans4x4WHT,
+ ::testing::Values(
+ make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif
+
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
diff --git a/tools_common.c b/tools_common.c
index 901734e..8d356af 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -392,7 +392,7 @@
(uint16_t *)(src->planes[plane] + y * src->stride[plane]);
uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
for (x = 0; x < w; x++) {
- *p_dst++ = *p_src++;
+ *p_dst++ = (uint8_t)(*p_src++);
}
}
}
diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
index 448217f..14282db 100644
--- a/vp8/encoder/x86/quantize_ssse3.c
+++ b/vp8/encoder/x86/quantize_ssse3.c
@@ -17,7 +17,7 @@
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
static int bsr(int mask) {
- int eob;
+ unsigned long eob;
_BitScanReverse(&eob, mask);
eob++;
if (mask == 0)
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index fd8fe3f..8054cdf 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -128,6 +128,11 @@
ENTROPY_CONTEXT *left_context;
int16_t seg_dequant[MAX_SEGMENTS][2];
+ // number of 4x4s in current block
+ uint16_t n4_w, n4_h;
+ // log2 of n4_w, n4_h
+ uint8_t n4_wl, n4_hl;
+
// encoder
const int16_t *dequant;
};
@@ -144,6 +149,8 @@
typedef struct macroblockd {
struct macroblockd_plane plane[MAX_MB_PLANE];
+ uint8_t bmode_blocks_wl;
+ uint8_t bmode_blocks_hl;
FRAME_COUNTS *counts;
TileInfo tile;
@@ -187,12 +194,6 @@
int bd;
#endif
- /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-#if CONFIG_VP9_ENCODER
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
-#else
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-#endif
int lossless;
int corrupted;
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index cee1682..cbce2dd 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -37,6 +37,10 @@
_mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
(int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
@@ -158,7 +162,7 @@
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
-#define WRAPLOW(x, bd) (x)
+#define WRAPLOW(x, bd) ((int32_t)(x))
#endif // CONFIG_EMULATE_HARDWARE
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 8f1240a..ca0dfc8 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -348,11 +348,12 @@
(const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
}
-static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
+static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
+ tran_low_t *dqcoeff) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- xd->plane[i].dqcoeff = xd->dqcoeff;
+ xd->plane[i].dqcoeff = dqcoeff;
xd->above_context[i] = cm->above_context +
i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 524e79f..538f1ed 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -475,7 +475,7 @@
specialize qw/vp9_iwht4x4_1_add msa/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_iwht4x4_16_add msa/;
+ specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
}
}
diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
new file mode 100644
index 0000000..9619e37
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+ paddw m0, m2
+ psubw m3, m1
+ psubw m4, m0, m3
+ psraw m4, 1
+ psubw m5, m4, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ SWAP 4, 0
+ SWAP 5, 2
+ punpcklwd m4, m1
+ pshufd m0, m4, 0x0e
+ punpcklwd m5, m3
+ pshufd m2, m5, 0x0e
+ SWAP 1, 4
+ SWAP 3, 0
+ punpckldq m1, m5
+ pshufd m4, m1, 0x0e
+ punpckldq m3, m2
+ pshufd m0, m3, 0x0e
+ SWAP 2, 3, 0, 1, 4
+%endmacro
+
+; transposes a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movq m%3, [outputq]
+ movq m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 4, 7, input, output, stride
+ mova m0, [inputq + 0] ;a1
+ mova m1, [inputq + 16] ;c1
+
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index e321dbe..fe8af54 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1333,43 +1333,47 @@
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i x8, x9, x10, x11, x12, x13, x14, x15;
- // Read in 16 lines
- x0 = _mm_loadl_epi64((__m128i *)in0);
- x8 = _mm_loadl_epi64((__m128i *)in1);
- x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
- x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
- x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
- x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
- x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
- x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
- x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
- x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
- x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
- x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
- x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
- x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
- x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
- x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
+ // 2-way interleave w/hoisting of unpacks
+ x0 = _mm_loadl_epi64((__m128i *)in0); // 1
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
+ x0 = _mm_unpacklo_epi8(x0, x1); // 1
- x0 = _mm_unpacklo_epi8(x0, x1);
- x1 = _mm_unpacklo_epi8(x2, x3);
- x2 = _mm_unpacklo_epi8(x4, x5);
- x3 = _mm_unpacklo_epi8(x6, x7);
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7
+ x1 = _mm_unpacklo_epi8(x2, x3); // 2
- x8 = _mm_unpacklo_epi8(x8, x9);
- x9 = _mm_unpacklo_epi8(x10, x11);
- x10 = _mm_unpacklo_epi8(x12, x13);
- x11 = _mm_unpacklo_epi8(x14, x15);
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11
+ x2 = _mm_unpacklo_epi8(x4, x5); // 3
- x4 = _mm_unpacklo_epi16(x0, x1);
- x5 = _mm_unpacklo_epi16(x2, x3);
- x12 = _mm_unpacklo_epi16(x8, x9);
- x13 = _mm_unpacklo_epi16(x10, x11);
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15
+ x3 = _mm_unpacklo_epi8(x6, x7); // 4
+ x4 = _mm_unpacklo_epi16(x0, x1); // 9
- x6 = _mm_unpacklo_epi32(x4, x5);
- x7 = _mm_unpackhi_epi32(x4, x5);
- x14 = _mm_unpacklo_epi32(x12, x13);
- x15 = _mm_unpackhi_epi32(x12, x13);
+ x8 = _mm_loadl_epi64((__m128i *)in1); // 2
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
+ x8 = _mm_unpacklo_epi8(x8, x9); // 5
+ x5 = _mm_unpacklo_epi16(x2, x3); // 10
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8
+ x9 = _mm_unpacklo_epi8(x10, x11); // 6
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12
+ x10 = _mm_unpacklo_epi8(x12, x13); // 7
+ x12 = _mm_unpacklo_epi16(x8, x9); // 11
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16
+ x11 = _mm_unpacklo_epi8(x14, x15); // 8
+ x13 = _mm_unpacklo_epi16(x10, x11); // 12
+
+ x6 = _mm_unpacklo_epi32(x4, x5); // 13
+ x7 = _mm_unpackhi_epi32(x4, x5); // 14
+ x14 = _mm_unpacklo_epi32(x12, x13); // 15
+ x15 = _mm_unpackhi_epi32(x12, x13); // 16
// Store first 4-line result
_mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
@@ -1405,33 +1409,36 @@
x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
- x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
- x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
- x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
- x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
- x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
- x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
+ x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
// 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
+ x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
// 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
+ x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
// 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
x3 = _mm_unpacklo_epi8(x6, x7);
+
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
x4 = _mm_unpacklo_epi16(x0, x1);
// 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
x5 = _mm_unpacklo_epi16(x2, x3);
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
x6 = _mm_unpacklo_epi32(x4, x5);
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi32(x4, x5);
-
_mm_storel_pd((double *)(out + 0*out_p),
_mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
_mm_storeh_pd((double *)(out + 1*out_p),
_mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
_mm_storel_pd((double *)(out + 2*out_p),
_mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
_mm_storeh_pd((double *)(out + 3*out_p),
@@ -1443,13 +1450,13 @@
x5 = _mm_unpackhi_epi16(x2, x3);
// 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
x6 = _mm_unpacklo_epi32(x4, x5);
- // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi32(x4, x5);
-
_mm_storel_pd((double *)(out + 4*out_p),
_mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
_mm_storeh_pd((double *)(out + 5*out_p),
_mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
_mm_storel_pd((double *)(out + 6*out_p),
_mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
_mm_storeh_pd((double *)(out + 7*out_p),
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6c64540..b3cf3fd 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -182,35 +182,29 @@
}
}
-static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
- TX_SIZE tx_size, uint8_t *dst, int stride,
- int eob) {
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
+ const TX_SIZE tx_size,
+ uint8_t *dst, int stride,
+ int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane];
if (eob > 0) {
- TX_TYPE tx_type = DCT_DCT;
tran_low_t *const dqcoeff = pd->dqcoeff;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
if (xd->lossless) {
- tx_type = DCT_DCT;
vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
} else {
- const PLANE_TYPE plane_type = pd->plane_type;
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(plane_type, xd, block);
- vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
break;
case TX_8X8:
- tx_type = get_tx_type(plane_type, xd);
- vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
break;
case TX_16X16:
- tx_type = get_tx_type(plane_type, xd);
- vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
break;
case TX_32X32:
- tx_type = DCT_DCT;
vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
break;
default:
@@ -219,25 +213,19 @@
}
} else {
if (xd->lossless) {
- tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
} else {
- const PLANE_TYPE plane_type = pd->plane_type;
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(plane_type, xd, block);
- vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+ vp9_idct4x4_add(dqcoeff, dst, stride, eob);
break;
case TX_8X8:
- tx_type = get_tx_type(plane_type, xd);
- vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ vp9_idct8x8_add(dqcoeff, dst, stride, eob);
break;
case TX_16X16:
- tx_type = get_tx_type(plane_type, xd);
- vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ vp9_idct16x16_add(dqcoeff, dst, stride, eob);
break;
case TX_32X32:
- tx_type = DCT_DCT;
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
default:
@@ -248,25 +236,19 @@
}
#else
if (xd->lossless) {
- tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
} else {
- const PLANE_TYPE plane_type = pd->plane_type;
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(plane_type, xd, block);
- vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+ vp9_idct4x4_add(dqcoeff, dst, stride, eob);
break;
case TX_8X8:
- tx_type = get_tx_type(plane_type, xd);
- vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ vp9_idct8x8_add(dqcoeff, dst, stride, eob);
break;
case TX_16X16:
- tx_type = get_tx_type(plane_type, xd);
- vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ vp9_idct16x16_add(dqcoeff, dst, stride, eob);
break;
case TX_32X32:
- tx_type = DCT_DCT;
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
default:
@@ -277,7 +259,97 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
if (eob == 1) {
- memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+ dqcoeff[0] = 0;
+ } else {
+ if (tx_size <= TX_16X16 && eob <= 10)
+ memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+ else
+ memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+ }
+ }
+}
+
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+ const TX_TYPE tx_type,
+ const TX_SIZE tx_size,
+ uint8_t *dst, int stride,
+ int eob) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (eob > 0) {
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (xd->lossless) {
+ vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_8X8:
+ vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_16X16:
+ vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_32X32:
+ vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ }
+ }
+ } else {
+ if (xd->lossless) {
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;
+ }
+ }
+ }
+#else
+ if (xd->lossless) {
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (eob == 1) {
+ dqcoeff[0] = 0;
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
@@ -289,6 +361,16 @@
}
}
+static INLINE void dec_txfrm_block_to_raster_xy(int bwl,
+ TX_SIZE tx_size, int block,
+ int *x, int *y) {
+ const int tx_cols_log2 = bwl - tx_size;
+ const int tx_cols = 1 << tx_cols_log2;
+ const int raster_mb = block >> (tx_size << 1);
+ *x = (raster_mb & (tx_cols - 1)) << tx_size;
+ *y = (raster_mb >> tx_cols_log2) << tx_size;
+}
+
struct intra_args {
MACROBLOCKD *xd;
vp9_reader *r;
@@ -296,7 +378,6 @@
};
static void predict_and_reconstruct_intra_block(int plane, int block,
- BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct intra_args *const args = (struct intra_args *)arg;
MACROBLOCKD *const xd = args->xd;
@@ -306,48 +387,39 @@
: mi->mbmi.uv_mode;
int x, y;
uint8_t *dst;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ const int bwl = pd->n4_wl;
+ dec_txfrm_block_to_raster_xy(bwl, tx_size, block, &x, &y);
dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
vp9_predict_intra_block(xd, block >> (tx_size << 1),
- b_width_log2_lookup[plane_bsize], tx_size, mode,
+ bwl, tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride,
x, y, plane);
if (!mi->mbmi.skip) {
+ const TX_TYPE tx_type = (plane || xd->lossless) ?
+ DCT_DCT : intra_mode_to_tx_type_lookup[mode];
const scan_order *sc = (plane || xd->lossless) ?
- &vp9_default_scan_orders[tx_size] :
- &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
- const int eob = vp9_decode_block_tokens(xd, plane, sc,
- plane_bsize, x, y, tx_size,
+ &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
+ const int eob = vp9_decode_block_tokens(xd, plane, sc, x, y, tx_size,
args->r, args->seg_id);
- inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
- eob);
+ inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+ dst, pd->dst.stride, eob);
}
}
-struct inter_args {
- MACROBLOCKD *xd;
- vp9_reader *r;
- int *eobtotal;
- int seg_id;
-};
-
-static void reconstruct_inter_block(int plane, int block,
- BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- struct inter_args *args = (struct inter_args *)arg;
- MACROBLOCKD *const xd = args->xd;
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vp9_reader *r,
+ MB_MODE_INFO *const mbmi, int plane,
+ int row, int col, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- int x, y, eob;
const scan_order *sc = &vp9_default_scan_orders[tx_size];
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
- eob = vp9_decode_block_tokens(xd, plane, sc, plane_bsize,
- x, y, tx_size, args->r, args->seg_id);
- inverse_transform_block(xd, plane, block, tx_size,
- &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
- pd->dst.stride, eob);
- *args->eobtotal += eob;
+ const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+ mbmi->segment_id);
+
+ inverse_transform_block_inter(xd, plane, tx_size,
+ &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+ pd->dst.stride, eob);
+ return eob;
}
static void build_mc_border(const uint8_t *src, int src_stride,
@@ -648,8 +720,7 @@
static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
MACROBLOCKD *xd,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+ int mi_row, int mi_col) {
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
@@ -659,15 +730,13 @@
const int is_compound = has_second_ref(&mi->mbmi);
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
- &xd->plane[plane]);
struct macroblockd_plane *const pd = &xd->plane[plane];
struct buf_2d *const dst_buf = &pd->dst;
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int num_4x4_w = pd->n4_w;
+ const int num_4x4_h = pd->n4_h;
- const int bw = 4 * num_4x4_w;
- const int bh = 4 * num_4x4_h;
+ const int n4w_x4 = 4 * num_4x4_w;
+ const int n4h_x4 = 4 * num_4x4_h;
int ref;
for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -680,11 +749,10 @@
if (sb_type < BLOCK_8X8) {
int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y) {
for (x = 0; x < num_4x4_w; ++x) {
const MV mv = average_split_mvs(pd, mi, ref, i++);
- dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
sf, pre_buf, dst_buf, &mv,
ref_frame_buf, is_scaled, ref);
@@ -692,8 +760,8 @@
}
} else {
const MV mv = mi->mbmi.mv[ref].as_mv;
- dec_build_inter_predictors(pbi, xd, plane, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y, kernel,
+ dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+ 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
sf, pre_buf, dst_buf, &mv, ref_frame_buf,
is_scaled, ref);
}
@@ -701,24 +769,105 @@
}
}
+static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
+ int n4_wl, int n4_hl) {
+ // get minimum log2 num4x4s dimension
+ const int x = MIN(n4_wl, n4_hl);
+ return MIN(mbmi->tx_size, x);
+}
+
+// TODO(slavarnway): Eliminate the foreach_ functions in future commits.
+// NOTE: Jingning removed the foreach_ for recon inter in a previous commit.
+
+typedef void (*dec_foreach_transformed_block_visitor)(int plane, int block,
+ TX_SIZE tx_size,
+ void *arg);
+
+static void dec_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd,
+ int plane,
+ dec_foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi;
+ // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+ // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+ // transform size varies per plane, look it up in a common way.
+ const TX_SIZE tx_size =
+ plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+ : mbmi->tx_size;
+ const int num_4x4_w = pd->n4_w;
+ const int num_4x4_h = pd->n4_h;
+ const int step = 1 << (tx_size << 1);
+ int i = 0, r, c;
+
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+ for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ if (c < max_blocks_wide)
+ visit(plane, i, tx_size, arg);
+ i += step;
+ }
+ }
+}
+
+static void dec_foreach_transformed_block(const MACROBLOCKD* const xd,
+ dec_foreach_transformed_block_visitor visit, void *arg) {
+ int plane;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+ dec_foreach_transformed_block_in_plane(xd, plane, visit, arg);
+}
+
+static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_w);
+ memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_h);
+ }
+}
+
+static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+ int bhl) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+ xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+ xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
+ xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+ }
+}
+
static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, int mi_row, int mi_col) {
- const int bw = num_8x8_blocks_wide_lookup[bsize];
- const int bh = num_8x8_blocks_high_lookup[bsize];
- const int x_mis = MIN(bw, cm->mi_cols - mi_col);
- const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int bw, int bh, int x_mis, int y_mis,
+ int bwl, int bhl) {
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
const TileInfo *const tile = &xd->tile;
xd->mi = cm->mi_grid_visible + offset;
xd->mi[0] = &cm->mi[offset];
+ // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+ // passing bsize from decode_partition().
xd->mi[0]->mbmi.sb_type = bsize;
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x) {
xd->mi[y * cm->mi_stride + x] = xd->mi[0];
}
+ set_plane_n4(xd, bw, bh, bwl, bhl);
+
set_skip_context(xd, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
@@ -731,10 +880,17 @@
static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize) {
+ vp9_reader *r, BLOCK_SIZE bsize,
+ int bwl, int bhl) {
VP9_COMMON *const cm = &pbi->common;
const int less8x8 = bsize < BLOCK_8X8;
- MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col);
+ const int bw = 1 << (bwl - 1);
+ const int bh = 1 << (bhl - 1);
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+
+ MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+ bw, bh, x_mis, y_mis, bwl, bhl);
if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
const BLOCK_SIZE uv_subsize =
@@ -744,28 +900,45 @@
VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
}
- vp9_read_mode_info(pbi, xd, mi_row, mi_col, r);
-
- if (less8x8)
- bsize = BLOCK_8X8;
+ vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
if (mbmi->skip) {
- reset_skip_context(xd, bsize);
+ dec_reset_skip_context(xd);
}
if (!is_inter_block(mbmi)) {
struct intra_args arg = {xd, r, mbmi->segment_id};
- vp9_foreach_transformed_block(xd, bsize,
+ dec_foreach_transformed_block(xd,
predict_and_reconstruct_intra_block, &arg);
} else {
// Prediction
- dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
+ dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
// Reconstruction
if (!mbmi->skip) {
int eobtotal = 0;
- struct inter_args arg = {xd, r, &eobtotal, mbmi->segment_id};
- vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+ int plane;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size =
+ plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+ : mbmi->tx_size;
+ const int num_4x4_w = pd->n4_w;
+ const int num_4x4_h = pd->n4_h;
+ const int step = (1 << tx_size);
+ int row, col;
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+ 0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+ 0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ for (row = 0; row < max_blocks_high; row += step)
+ for (col = 0; col < max_blocks_wide; col += step)
+ eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+ tx_size);
+ }
+
if (!less8x8 && eobtotal == 0)
mbmi->skip = 1; // skip loopfilter
}
@@ -774,10 +947,36 @@
xd->corrupted |= vp9_reader_has_error(r);
}
+static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ int bsl) {
+ const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+ int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1;
+
+// assert(bsl >= 0);
+
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE subsize,
+ int bw) {
+ PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+ PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+
+ // update the partition context at the end notes. set partition bits
+ // of block sizes larger than the current one to be one, and partition
+ // bits of smaller block sizes to be zero.
+ memset(above_ctx, partition_context_lookup[subsize].above, bw);
+ memset(left_ctx, partition_context_lookup[subsize].left, bw);
+}
+
static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize, vp9_reader *r,
- int has_rows, int has_cols) {
- const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ vp9_reader *r,
+ int has_rows, int has_cols, int bsl) {
+ const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
const vp9_prob *const probs = get_partition_probs(xd, ctx);
FRAME_COUNTS *counts = xd->counts;
PARTITION_TYPE p;
@@ -797,11 +996,14 @@
return p;
}
+// TODO(slavarnway): eliminate bsize and subsize in future commits
static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize) {
+ vp9_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
VP9_COMMON *const cm = &pbi->common;
- const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const int n8x8_l2 = n4x4_l2 - 1;
+ const int num_8x8_wh = 1 << n8x8_l2;
+ const int hbs = num_8x8_wh >> 1;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
const int has_rows = (mi_row + hbs) < cm->mi_rows;
@@ -810,30 +1012,37 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols);
- subsize = get_subsize(bsize, partition);
- if (bsize == BLOCK_8X8) {
- decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+ partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols,
+ n8x8_l2);
+ subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition);
+ if (!hbs) {
+ // calculate bmode block dimensions (log 2)
+ xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+ xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+ decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
} else {
switch (partition) {
case PARTITION_NONE:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
break;
case PARTITION_HORZ:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
if (has_rows)
- decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize);
+ decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+ n8x8_l2);
break;
case PARTITION_VERT:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
if (has_cols)
- decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize);
+ decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+ n4x4_l2);
break;
case PARTITION_SPLIT:
- decode_partition(pbi, xd, mi_row, mi_col, r, subsize);
- decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize);
- decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize);
- decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize);
+ decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+ n8x8_l2);
break;
default:
assert(0 && "Invalid partition type");
@@ -843,7 +1052,7 @@
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
- update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
}
static void setup_token_decoder(const uint8_t *data,
@@ -1324,11 +1533,12 @@
tile_data->xd.corrupted = 0;
tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
NULL : &cm->counts;
+ vp9_zero(tile_data->dqcoeff);
vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
- vp9_init_macroblockd(cm, &tile_data->xd);
+ vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
}
}
@@ -1346,8 +1556,8 @@
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64);
+ decode_partition(pbi, &tile_data->xd, mi_row,
+ mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
}
pbi->mb.corrupted |= tile_data->xd.corrupted;
if (pbi->mb.corrupted)
@@ -1421,7 +1631,7 @@
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->pbi, &tile_data->xd,
mi_row, mi_col, &tile_data->bit_reader,
- BLOCK_64X64);
+ BLOCK_64X64, 4);
}
}
return !tile_data->xd.corrupted;
@@ -1543,12 +1753,13 @@
tile_data->xd.corrupted = 0;
tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
0 : &tile_data->counts;
+ vp9_zero(tile_data->dqcoeff);
vp9_tile_init(tile, cm, 0, buf->col);
vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
- vp9_init_macroblockd(cm, &tile_data->xd);
+ vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
worker->had_error = 0;
if (i == num_workers - 1 || n == tile_cols - 1) {
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 0d08f8c..d42a654 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -91,41 +91,44 @@
return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
}
-static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
- int mi_row, int mi_col, int segment_id) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = num_8x8_blocks_wide_lookup[bsize];
- const int bh = num_8x8_blocks_high_lookup[bsize];
- const int xmis = MIN(cm->mi_cols - mi_col, bw);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
+static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
+ int mi_offset, int x_mis, int y_mis) {
+ int x, y, segment_id = INT_MAX;
+
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
+ segment_id = MIN(segment_id,
+ segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ return segment_id;
+}
+
+static void set_segment_id(VP9_COMMON *cm, int mi_offset,
+ int x_mis, int y_mis, int segment_id) {
int x, y;
assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
- for (y = 0; y < ymis; y++)
- for (x = 0; x < xmis; x++)
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
static void copy_segment_id(const VP9_COMMON *cm,
const uint8_t *last_segment_ids,
uint8_t *current_segment_ids,
- BLOCK_SIZE bsize, int mi_row, int mi_col) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = num_8x8_blocks_wide_lookup[bsize];
- const int bh = num_8x8_blocks_high_lookup[bsize];
- const int xmis = MIN(cm->mi_cols - mi_col, bw);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int mi_offset, int x_mis, int y_mis) {
int x, y;
- for (y = 0; y < ymis; y++)
- for (x = 0; x < xmis; x++)
+ for (y = 0; y < y_mis; y++)
+ for (x = 0; x < x_mis; x++)
current_segment_ids[mi_offset + y * cm->mi_cols + x] = last_segment_ids ?
last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
}
-static int read_intra_segment_id(VP9_COMMON *const cm, BLOCK_SIZE bsize,
- int mi_row, int mi_col,
+static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
+ int x_mis, int y_mis,
vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
int segment_id;
@@ -135,12 +138,12 @@
if (!seg->update_map) {
copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
- bsize, mi_row, mi_col);
+ mi_offset, x_mis, y_mis);
return 0;
}
segment_id = read_segment_id(r, seg);
- set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
return segment_id;
}
@@ -148,18 +151,25 @@
int mi_row, int mi_col, vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- const BLOCK_SIZE bsize = mbmi->sb_type;
int predicted_segment_id, segment_id;
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = xd->plane[0].n4_w >> 1;
+ const int bh = xd->plane[0].n4_h >> 1;
+
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = MIN(cm->mi_rows - mi_row, bh);
if (!seg->enabled)
return 0; // Default for disabled segmentation
predicted_segment_id = cm->last_frame_seg_map ?
- get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
+ dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) :
+ 0;
if (!seg->update_map) {
copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
- bsize, mi_row, mi_col);
+ mi_offset, x_mis, y_mis);
return predicted_segment_id;
}
@@ -171,7 +181,7 @@
} else {
segment_id = read_segment_id(r, seg);
}
- set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
return segment_id;
}
@@ -198,8 +208,15 @@
const MODE_INFO *left_mi = xd->left_mi;
const BLOCK_SIZE bsize = mbmi->sb_type;
int i;
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = xd->plane[0].n4_w >> 1;
+ const int bh = xd->plane[0].n4_h >> 1;
- mbmi->segment_id = read_intra_segment_id(cm, bsize, mi_row, mi_col, r);
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = MIN(cm->mi_rows - mi_row, bh);
+
+ mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
mbmi->tx_size = read_tx_size(cm, xd, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME;
@@ -519,8 +536,8 @@
: cm->interp_filter;
if (bsize < BLOCK_8X8) {
- const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
- const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2
+ const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+ const int num_4x4_h = 1 << xd->bmode_blocks_hl;
int idx, idy;
PREDICTION_MODE b_mode;
int_mv nearest_sub8x8[2], near_sub8x8[2];
@@ -589,13 +606,10 @@
}
void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col, vp9_reader *r) {
+ int mi_row, int mi_col, vp9_reader *r,
+ int x_mis, int y_mis) {
VP9_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
- const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
- const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
- const int x_mis = MIN(bw, cm->mi_cols - mi_col);
- const int y_mis = MIN(bh, cm->mi_rows - mi_row);
MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
int w, h;
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index db57b40..53bac8c 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,7 +19,8 @@
#endif
void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col, vp9_reader *r);
+ int mi_row, int mi_col, vp9_reader *r,
+ int x_mis, int y_mis);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 76a8746..af47f85 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -32,6 +32,8 @@
VP9_COMMON *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
} TileData;
typedef struct TileWorkerData {
@@ -39,6 +41,8 @@
vp9_reader bit_reader;
FRAME_COUNTS counts;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
struct vpx_internal_error_info error_info;
} TileWorkerData;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 4e3ab82..5596c9e 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -205,9 +205,54 @@
return c;
}
+// TODO(slavarnway): Decode version of vp9_set_context. Modify vp9_set_context
+// after testing is complete, then delete this version.
+static
+void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+ TX_SIZE tx_size, int has_eob,
+ int aoff, int loff) {
+ ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_context + loff;
+ const int tx_size_in_blocks = 1 << tx_size;
+
+ // above
+ if (has_eob && xd->mb_to_right_edge < 0) {
+ int i;
+ const int blocks_wide = pd->n4_w +
+ (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ int above_contexts = tx_size_in_blocks;
+ if (above_contexts + aoff > blocks_wide)
+ above_contexts = blocks_wide - aoff;
+
+ for (i = 0; i < above_contexts; ++i)
+ a[i] = has_eob;
+ for (i = above_contexts; i < tx_size_in_blocks; ++i)
+ a[i] = 0;
+ } else {
+ memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ }
+
+ // left
+ if (has_eob && xd->mb_to_bottom_edge < 0) {
+ int i;
+ const int blocks_high = pd->n4_h +
+ (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int left_contexts = tx_size_in_blocks;
+ if (left_contexts + loff > blocks_high)
+ left_contexts = blocks_high - loff;
+
+ for (i = 0; i < left_contexts; ++i)
+ l[i] = has_eob;
+ for (i = left_contexts; i < tx_size_in_blocks; ++i)
+ l[i] = 0;
+ } else {
+ memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ }
+}
+
int vp9_decode_block_tokens(MACROBLOCKD *xd,
int plane, const scan_order *sc,
- BLOCK_SIZE plane_bsize, int x, int y,
+ int x, int y,
TX_SIZE tx_size, vp9_reader *r,
int seg_id) {
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -217,7 +262,7 @@
const int eob = decode_coefs(xd, pd->plane_type,
pd->dqcoeff, tx_size,
dequant, ctx, sc->scan, sc->neighbors, r);
- vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
+ dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
return eob;
}
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index ed826a4..cf0e48a 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -22,7 +22,7 @@
int vp9_decode_block_tokens(MACROBLOCKD *xd,
int plane, const scan_order *sc,
- BLOCK_SIZE plane_bsize, int x, int y,
+ int x, int y,
TX_SIZE tx_size, vp9_reader *r,
int seg_id);
diff --git a/vp9/decoder/vp9_reader.c b/vp9/decoder/vp9_reader.c
index 2c96f74..9a46cd7 100644
--- a/vp9/decoder/vp9_reader.c
+++ b/vp9/decoder/vp9_reader.c
@@ -7,12 +7,24 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
#include "vpx_ports/mem.h"
#include "vpx_mem/vpx_mem.h"
+#include "./vpx_config.h"
#include "vp9/decoder/vp9_reader.h"
+#include "vpx_util/endian_inl.h"
+
+#if CONFIG_BIG_ENDIAN
+#define BIGENDIFY64(X) (X)
+#define BIGENDIFY32(X) (X)
+#else
+#define BIGENDIFY64(X) BSwap64(X)
+#define BIGENDIFY32(X) BSwap32(X)
+#endif
+
int vp9_reader_init(vp9_reader *r,
const uint8_t *buffer,
size_t size,
@@ -39,11 +51,9 @@
const uint8_t *buffer_start = buffer;
BD_VALUE value = r->value;
int count = r->count;
- int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
- int loop_end = 0;
const size_t bytes_left = buffer_end - buffer;
const size_t bits_left = bytes_left * CHAR_BIT;
- const int x = (int)(shift + CHAR_BIT - bits_left);
+ int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
if (r->decrypt_cb) {
size_t n = MIN(sizeof(r->clear_buffer), bytes_left);
@@ -51,17 +61,31 @@
buffer = r->clear_buffer;
buffer_start = r->clear_buffer;
}
+ if (bits_left > BD_VALUE_SIZE) {
+#if UINTPTR_MAX == 0xffffffffffffffff
+ BD_VALUE big_endian_values = BIGENDIFY64(*((const BD_VALUE *) buffer));
+#else
+ BD_VALUE big_endian_values = BIGENDIFY32(*((const BD_VALUE *) buffer));
+#endif
+ const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+ const BD_VALUE nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+ count += bits;
+ buffer += (bits >> 3);
+ value = r->value | (nv << (shift & 0x7));
+ } else {
+ const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+ int loop_end = 0;
+ if (bits_over >= 0) {
+ count += LOTS_OF_BITS;
+ loop_end = bits_over;
+ }
- if (x >= 0) {
- count += LOTS_OF_BITS;
- loop_end = x;
- }
-
- if (x < 0 || bits_left) {
- while (shift >= loop_end) {
- count += CHAR_BIT;
- value |= (BD_VALUE)*buffer++ << shift;
- shift -= CHAR_BIT;
+ if (bits_over < 0 || bits_left) {
+ while (shift >= loop_end) {
+ count += CHAR_BIT;
+ value |= (BD_VALUE)*buffer++ << shift;
+ shift -= CHAR_BIT;
+ }
}
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 02d986e..a450945 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -194,7 +194,6 @@
set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
-
mbmi = &xd->mi[0]->mbmi;
// Set up destination pointers.
@@ -2123,38 +2122,6 @@
BLOCK_64X64
};
-// Checks to see if a macro block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(VP9_COMP *cpi,
- int mi_row, int mi_col) {
- int is_active_edge = 0;
- int top_edge = 0;
- int bottom_edge = cpi->common.mi_rows;
- int left_edge = 0;
- int right_edge = cpi->common.mi_cols;
-
- // For two pass account for any formatting bars detected.
- if (cpi->oxcf.pass == 2) {
- TWO_PASS *twopass = &cpi->twopass;
-
- // The inactive region is specified in MBs not mi units.
- // The image edge is in the following MB row.
- top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-
- bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
- bottom_edge = MAX(top_edge, bottom_edge);
- }
-
- if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
- ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
- ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
- ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
- is_active_edge = 1;
- }
-
- return is_active_edge;
-}
// Look at all the mode_info entries for blocks that are part of this
// partition and find the min and max values for sb_type.
@@ -2253,7 +2220,7 @@
// Test for blocks at the edge of the active image.
// This may be the actual edge of the image or where there are formatting
// bars.
- if (active_edge_sb(cpi, mi_row, mi_col)) {
+ if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
min_size = BLOCK_4X4;
} else {
min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
@@ -2674,8 +2641,9 @@
}
// PARTITION_HORZ
- if (partition_horz_allowed && do_rect) {
- subsize = get_subsize(bsize, PARTITION_HORZ);
+ if (partition_horz_allowed &&
+ (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+ subsize = get_subsize(bsize, PARTITION_HORZ);
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
@@ -2721,8 +2689,9 @@
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// PARTITION_VERT
- if (partition_vert_allowed && do_rect) {
- subsize = get_subsize(bsize, PARTITION_VERT);
+ if (partition_vert_allowed &&
+ (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+ subsize = get_subsize(bsize, PARTITION_VERT);
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3130941..659ce72 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -763,7 +763,7 @@
}
void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
+ TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f27f57a..6c595b7 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -732,7 +732,7 @@
vp9_set_mb_mi(cm, cm->width, cm->height);
vp9_init_context_buffers(cm);
- vp9_init_macroblockd(cm, xd);
+ vp9_init_macroblockd(cm, xd, NULL);
cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
memset(cpi->mbmi_ext_base, 0,
cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a85b70b..61279f8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -126,6 +126,7 @@
stats->pcnt_neutral,
stats->intra_skip_pct,
stats->inactive_zone_rows,
+ stats->inactive_zone_cols,
stats->MVr,
stats->mvr_abs,
stats->MVc,
@@ -164,6 +165,7 @@
section->pcnt_neutral = 0.0;
section->intra_skip_pct = 0.0;
section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -191,6 +193,7 @@
section->pcnt_neutral += frame->pcnt_neutral;
section->intra_skip_pct += frame->intra_skip_pct;
section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -216,6 +219,7 @@
section->pcnt_neutral -= frame->pcnt_neutral;
section->intra_skip_pct -= frame->intra_skip_pct;
section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
section->MVr -= frame->MVr;
section->mvr_abs -= frame->mvr_abs;
section->MVc -= frame->MVc;
@@ -1050,6 +1054,7 @@
fps.pcnt_neutral = (double)neutral_count / num_mbs;
fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
fps.inactive_zone_rows = (double)image_data_start_row;
+ fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix
if (mvcount > 0) {
fps.MVr = (double)sum_mvr / mvcount;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 0047932..49f9da3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -53,6 +53,7 @@
double pcnt_neutral;
double intra_skip_pct;
double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
double MVr;
double mvr_abs;
double MVc;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index e6e17c0..32c1f76 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -57,17 +57,14 @@
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
- const int rc = 0;
- const int coeff = coeff_ptr[rc];
+ const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- const int64_t tmp =
- (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
- quant) >> 16;
- qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
- if (tmp)
+ const int64_t tmp = abs_coeff + round_ptr[0];
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+ if (abs_qcoeff)
eob = 0;
}
*eob_ptr = eob + 1;
@@ -89,7 +86,6 @@
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
-
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
@@ -117,17 +113,14 @@
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
- const int rc = 0;
- const int coeff = coeff_ptr[rc];
+ const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- const int64_t tmp =
- (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
- INT32_MIN, INT32_MAX) * quant) >> 15;
- qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
- if (tmp)
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+ if (abs_qcoeff)
eob = 0;
}
*eob_ptr = eob + 1;
@@ -207,15 +200,11 @@
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
- const int64_t tmp =
- (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
- quant_ptr[rc != 0]) >> 16;
-
- qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+ const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
- if (tmp)
+ if (abs_qcoeff)
eob = i;
}
}
@@ -287,21 +276,21 @@
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
+ uint32_t abs_qcoeff = 0;
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
- int64_t tmp = 0;
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
- tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
- INT32_MIN, INT32_MAX);
- tmp = (tmp * quant_ptr[rc != 0]) >> 15;
- qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+ const int64_t tmp = abs_coeff
+ + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
}
- if (tmp)
+ if (abs_qcoeff)
eob = i;
}
}
@@ -398,14 +387,13 @@
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= zbins[rc != 0]) {
- int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0],
- INT32_MIN, INT32_MAX);
- tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >> 16; // quantization
- qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+ const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
- if (tmp)
+ if (abs_qcoeff)
eob = i;
}
}
@@ -513,16 +501,14 @@
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- int64_t tmp = clamp(abs_coeff +
- ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
- INT32_MIN, INT32_MAX);
- tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >> 15;
-
- qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+ const int64_t tmp1 = abs_coeff
+ + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
- if (tmp)
+ if (abs_qcoeff)
eob = idx_arr[i];
}
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3a27e89..9985f89 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2898,6 +2898,77 @@
*this_rd += (*this_rd * var_factor) / 100;
}
+
+// Do we have an internal image edge (e.g. formatting bars).
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+ return (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+ (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
+ int top_edge = 0;
+ int bottom_edge = cpi->common.mi_rows;
+ int is_active_h_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ TWO_PASS *twopass = &cpi->twopass;
+
+ // The inactive region is specified in MBs not mi units.
+ // The image edge is in the following MB row.
+ top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+ bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+ bottom_edge = MAX(top_edge, bottom_edge);
+ }
+
+ if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+ ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+ is_active_h_edge = 1;
+ }
+ return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
+ int left_edge = 0;
+ int right_edge = cpi->common.mi_cols;
+ int is_active_v_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ TWO_PASS *twopass = &cpi->twopass;
+
+ // The inactive region is specified in MBs not mi units.
+ // The image edge is in the following MB row.
+ left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+ right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+ right_edge = MAX(left_edge, right_edge);
+ }
+
+ if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+ ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+ is_active_v_edge = 1;
+ }
+ return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_edge_sb(VP9_COMP *cpi,
+ int mi_row, int mi_col) {
+ return vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
+ vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+}
+
void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
TileDataEnc *tile_data,
MACROBLOCK *x,
@@ -3751,13 +3822,15 @@
int skip_uv;
PREDICTION_MODE mode_uv = DC_PRED;
const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
int best_skip2 = 0;
int ref_frame_skip_mask[2] = { 0 };
int64_t mask_filter = 0;
int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+ int internal_active_edge =
+ vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3843,7 +3916,8 @@
continue;
// Test best rd so far against threshold for trying this mode.
- if (rd_less_than_thresh(best_rd,
+ if (!internal_active_edge &&
+ rd_less_than_thresh(best_rd,
rd_opt->threshes[segment_id][bsize][ref_index],
tile_data->thresh_freq_fact[bsize][ref_index]))
continue;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 459b032..00ee55c 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -54,6 +54,11 @@
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
+int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
struct TileDataEnc *tile_data,
struct macroblock *x,
@@ -61,6 +66,7 @@
struct RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6c6c4ed..e544f9b 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -12,6 +12,8 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
@@ -89,8 +91,10 @@
// If this is a two pass clip that fits the criteria for animated or
// graphics content then reset disable_split_mask for speeds 1-4.
+ // Also if the image edge is internal to the coded area.
if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
- (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)) {
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (vp9_internal_image_edge(cpi)))) {
sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
}
@@ -112,7 +116,12 @@
sf->allow_skip_recode = 1;
if (speed >= 1) {
- sf->use_square_partition_only = !frame_is_intra_only(cm);
+ if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ vp9_internal_image_edge(cpi)) {
+ sf->use_square_partition_only = frame_is_boosted(cpi);
+ } else {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ }
sf->less_rectangular_check = 1;
sf->use_rd_breakout = 1;
@@ -152,6 +161,7 @@
}
if (speed >= 3) {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
index 003ebd1..5074d31 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
@@ -28,7 +28,8 @@
temp_in[j] = intermediate[j * 32 + i];
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
- out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
#define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_c
@@ -42,7 +43,7 @@
temp_in[j] = intermediate[j * 32 + i];
vp9_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
- out[j + i * 32] = temp_out[j];
+ out[j + i * 32] = (tran_low_t)temp_out[j];
}
}
#define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_rd_c
diff --git a/vp9/encoder/x86/vp9_dct_sse2_impl.h b/vp9/encoder/x86/vp9_dct_sse2_impl.h
index 11bf5a2..86e9ecf 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct_sse2_impl.h
@@ -40,35 +40,35 @@
// These are the coefficients used for the multiplies.
// In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
// where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
cospi_8_64, cospi_24_64,
cospi_24_64, -cospi_8_64,
cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
cospi_24_64, -cospi_8_64,
cospi_8_64, cospi_24_64,
cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64,
cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64,
cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
cospi_8_64, cospi_24_64,
-cospi_8_64, -cospi_24_64,
-cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
cospi_24_64, -cospi_8_64,
-cospi_24_64, cospi_8_64,
-cospi_24_64, cospi_8_64);
@@ -267,7 +267,7 @@
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -588,7 +588,7 @@
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
index cbdd1c9..0174cfe 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -85,13 +85,13 @@
for (j = 0; j < 4; j++) {
if (test & (1 << (4 * j))) {
int k = 4 * i + j;
- int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
- INT32_MIN, INT32_MAX);
- tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
- quant_shift_ptr[k != 0]) >> 16; // quantization
- qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+ const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
- if (tmp)
+ if (abs_qcoeff)
eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
@@ -162,17 +162,15 @@
const int rc = idx_arr[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
- int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- int64_t tmp = clamp(abs_coeff +
- ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
- INT32_MIN, INT32_MAX);
- tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >> 15;
-
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff
+ + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
- if (tmp)
+ if (abs_qcoeff)
eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2e43c27..bcab558 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -75,6 +75,7 @@
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index fe35c1e..b45331c 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -50,7 +50,7 @@
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
- int64_t sum_long = 0;
+ int32_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
@@ -63,7 +63,7 @@
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
@@ -72,7 +72,7 @@
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
- int64_t sum_long = 0;
+ int32_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
@@ -85,7 +85,7 @@
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
}
@@ -386,7 +386,7 @@
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
@@ -555,7 +555,7 @@
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
diff --git a/vpx_util/endian_inl.h b/vpx_util/endian_inl.h
new file mode 100644
index 0000000..91753a6
--- /dev/null
+++ b/vpx_util/endian_inl.h
@@ -0,0 +1,113 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef VPX_UTIL_ENDIAN_INL_H_
+#define VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+ (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+ (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif // __clang__
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#endif
+
+// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
+#if LOCAL_GCC_PREREQ(4, 3) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP32
+#define HAVE_BUILTIN_BSWAP64
+#endif
+// clang-3.3 and gcc-4.8 have a builtin function for swap16
+#if LOCAL_GCC_PREREQ(4, 8) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+ return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(x);
+#else
+ // gcc will recognize a 'rorw $8, ...' here:
+ return (x >> 8) | ((x & 0xff) << 8);
+#endif // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if HAVE_MIPS32
+ uint32_t ret;
+ __asm__ volatile (
+ "wsbh %[ret], %[x] \n\t"
+ "rotr %[ret], %[ret], 16 \n\t"
+ : [ret]"=r"(ret)
+ : [x]"r"(x)
+ );
+ return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+ return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t swapped_bytes;
+ __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint32_t)_byteswap_ulong(x);
+#else
+ return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+ return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+ uint64_t swapped_bytes;
+ __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint64_t)_byteswap_uint64(x);
+#else // generic code for swapping 64-bit values (suggested by bdb@)
+ x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+ x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+ x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
+ return x;
+#endif // HAVE_BUILTIN_BSWAP64
+}
+
+#endif // VPX_UTIL_ENDIAN_INL_H_
diff --git a/vpx_util/vpx_util.mk b/vpx_util/vpx_util.mk
index 1161125..c0ef8d3 100644
--- a/vpx_util/vpx_util.mk
+++ b/vpx_util/vpx_util.mk
@@ -11,3 +11,4 @@
UTIL_SRCS-yes += vpx_util.mk
UTIL_SRCS-yes += vpx_thread.c
UTIL_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += endian_inl.h