Merge "Rework sub8x8 chroma component inter predictor" into nextgenv2
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index 97f2c02..deccc81 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc
@@ -132,7 +132,7 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
int i, j;
const int stride = 16;
- const int num_tests = 200000;
+ const int num_tests = 1000;
for (i = 0; i < num_tests; ++i) {
for (j = 0; j < num_coeffs_; ++j) {
@@ -208,6 +208,7 @@
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 10),
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 12),
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 10),
@@ -218,7 +219,6 @@
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 12),
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 10),
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 12),
-#if CONFIG_EXT_TX
#endif // CONFIG_EXT_TX
};
INSTANTIATE_TEST_CASE_P(
diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
index 1309827..c5a4382 100644
--- a/test/vp10_fht4x4_test.cc
+++ b/test/vp10_fht4x4_test.cc
@@ -132,7 +132,7 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
int i, j;
const int stride = 4;
- const int num_tests = 200000;
+ const int num_tests = 1000;
const int num_coeffs = 16;
for (i = 0; i < num_tests; ++i) {
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
index 2c33939..da278c4 100644
--- a/test/vp10_fht8x8_test.cc
+++ b/test/vp10_fht8x8_test.cc
@@ -131,7 +131,7 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
int i, j;
const int stride = 8;
- const int num_tests = 200000;
+ const int num_tests = 1000;
const int num_coeffs = 64;
for (i = 0; i < num_tests; ++i) {
diff --git a/test/vp10_iht4x4_test.cc b/test/vp10_iht4x4_test.cc
index 1cad402..3960b5a 100644
--- a/test/vp10_iht4x4_test.cc
+++ b/test/vp10_iht4x4_test.cc
@@ -15,6 +15,7 @@
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
+#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
namespace {
@@ -34,24 +35,27 @@
// <target optimization function, tx_type, bit_depth>
typedef tuple<IHbdHtFunc, int, int> IHbdHt4x4Param;
-class VP10HighbdInvTrans4x4HT :
- public ::testing::TestWithParam<IHbdHt4x4Param> {
+class VP10HighbdInvTrans4x4HT
+ : public ::testing::TestWithParam<IHbdHt4x4Param> {
public:
virtual ~VP10HighbdInvTrans4x4HT() {}
virtual void SetUp() {
inv_txfm_ = GET_PARAM(0);
- inv_txfm_ref_ = iht4x4_ref;
tx_type_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
num_coeffs_ = 4 * 4;
+ // Note:
+ // The inverse transform input buffer is 32-byte aligned;
+ // refer to the function alloc_mode_context() in
+ // vp10/encoder/context_tree.c.
coeffs_ = reinterpret_cast<int32_t *>(
- vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+ vpx_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
output_ = reinterpret_cast<uint16_t *>(
- vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ vpx_memalign(32, sizeof(output_[0]) * num_coeffs_));
output_ref_ = reinterpret_cast<uint16_t *>(
- vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ vpx_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
}
virtual void TearDown() {
@@ -65,49 +69,39 @@
void RunBitexactCheck();
private:
+ static int32_t ClampCoeffs(int number, int bit) {
+ const int max = (1 << bit) - 1;
+ const int min = -max;
+ return clamp(number, min, max);
+ }
+
IHbdHtFunc inv_txfm_;
- IHbdHtFunc inv_txfm_ref_;
int tx_type_;
int bit_depth_;
int num_coeffs_;
int32_t *coeffs_;
uint16_t *output_;
uint16_t *output_ref_;
-
- int32_t clamp(int32_t number, int bit) {
- int32_t ret = number;
- const int32_t max = (int32_t)(1 << bit) - 1;
- const int32_t min = -max;
-
- if (number > max) {
- ret = max;
- } else if (number < min) {
- ret = min;
- }
- return ret;
- }
};
void VP10HighbdInvTrans4x4HT::RunBitexactCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int stride = 4;
const int num_tests = 2000000;
- int i;
- int j;
const uint16_t mask = (1 << bit_depth_) - 1;
- for (i = 0; i < num_tests; ++i) {
- for (j = 0; j < num_coeffs_; ++j) {
- coeffs_[j] = clamp((rnd.Rand16() - rnd.Rand16()) << 2, 18);
+ for (int i = 0; i < num_tests; ++i) {
+ for (int j = 0; j < num_coeffs_; ++j) {
+ coeffs_[j] = ClampCoeffs((rnd.Rand16() - rnd.Rand16()) << 2, 18);
output_ref_[j] = rnd.Rand16() & mask;
output_[j] = output_ref_[j];
}
- inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+ iht4x4_ref(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeffs_, output_, stride, tx_type_,
bit_depth_));
- for (j = 0; j < num_coeffs_; ++j) {
+ for (int j = 0; j < num_coeffs_; ++j) {
EXPECT_EQ(output_ref_[j], output_[j])
<< "Not bit-exact result at index: " << j
<< "At test block: " << i;
diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
index eef95f0..fef4629 100644
--- a/test/vp10_inv_txfm2d_test.cc
+++ b/test/vp10_inv_txfm2d_test.cc
@@ -104,8 +104,6 @@
TX_SIZE tx_size_;
int txfm1d_size_;
int txfm2d_size_;
- Fwd_Txfm2d_Func fwd_txfm_;
- Inv_Txfm2d_Func inv_txfm_;
int16_t* input_;
uint16_t* ref_input_;
int32_t* output_;
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index a5d50bb..717c914 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -1297,7 +1297,8 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+ vp10_inv_txfm2d_add_4x4_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
case V_DCT:
case H_DCT:
@@ -1336,7 +1337,8 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+ vp10_inv_txfm2d_add_8x8_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
case V_DCT:
case H_DCT:
@@ -1375,7 +1377,8 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+ vp10_inv_txfm2d_add_16x16_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
case V_DCT:
case H_DCT:
diff --git a/vp10/common/x86/highbd_inv_txfm_sse4.c b/vp10/common/x86/highbd_inv_txfm_sse4.c
index 0c623df..80d4c4f 100644
--- a/vp10/common/x86/highbd_inv_txfm_sse4.c
+++ b/vp10/common/x86/highbd_inv_txfm_sse4.c
@@ -9,18 +9,17 @@
*/
#include <assert.h>
-#include <smmintrin.h> /* SSE4.1 */
+#include <smmintrin.h> /* SSE4.1 */
#include "./vp10_rtcd.h"
#include "./vpx_config.h"
#include "vp10/common/vp10_inv_txfm2d_cfg.h"
-
static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
- in[0] = _mm_loadu_si128((const __m128i *)(coeff + 0));
- in[1] = _mm_loadu_si128((const __m128i *)(coeff + 4));
- in[2] = _mm_loadu_si128((const __m128i *)(coeff + 8));
- in[3] = _mm_loadu_si128((const __m128i *)(coeff + 12));
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}
static void idct4x4_sse4_1(__m128i *in, int bit) {
@@ -176,7 +175,7 @@
}
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
- int flipud, int fliplr, int shift, int bd) {
+ int shift, int bd) {
const __m128i zero = _mm_setzero_si128();
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
@@ -213,9 +212,6 @@
_mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
_mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
_mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
-
- (void) flipud;
- (void) fliplr;
}
void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
@@ -229,28 +225,28 @@
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
break;
case ADST_DCT:
cfg = &inv_txfm_2d_cfg_adst_dct_4;
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
break;
case DCT_ADST:
cfg = &inv_txfm_2d_cfg_dct_adst_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
break;
case ADST_ADST:
cfg = &inv_txfm_2d_cfg_adst_adst_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
break;
default:
assert(0);
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 67ebe6d..673a9e6 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -659,7 +659,7 @@
}
static INLINE int get_ref_frame_buf_idx(const VP10_COMP *const cpi,
- int ref_frame) {
+ MV_REFERENCE_FRAME ref_frame) {
const VP10_COMMON *const cm = &cpi->common;
const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
@@ -673,6 +673,14 @@
buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
}
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
+ VP10_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
+ // Use up-sampled reference frames.
+ const int buf_idx =
+ cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)];
+ return &cpi->upsampled_ref_bufs[buf_idx].buf;
+}
+
static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
// TODO(JBB): double check we can't exceed this token count if we have a
// 32x32 transform crossing a boundary at a multiple of 16.
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 5e88d15..c27c887 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -7619,6 +7619,19 @@
#else
int tmp_rate2 = rate2_nocoeff;
#endif // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ INTERP_FILTER obmc_interp_filter[2][2] = {
+ {mbmi->interp_filter[0], mbmi->interp_filter[1]}, // obmc == 0
+ {mbmi->interp_filter[0], mbmi->interp_filter[1]} // obmc == 1
+ };
+#else
+ INTERP_FILTER obmc_interp_filter[2] = {
+ mbmi->interp_filter, // obmc == 0
+ mbmi->interp_filter // obmc == 1
+ };
+#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_EXT_INTERP
if (mbmi->obmc) {
#if CONFIG_EXT_INTER
@@ -7647,6 +7660,21 @@
#else
tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
#endif // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+ obmc_interp_filter[1][0] = mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+ if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+ obmc_interp_filter[1][1] = mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#else
+ if (!vp10_is_interp_needed(xd))
+ obmc_interp_filter[1] = mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ // NOTE: This is not quite correct with CONFIG_DUAL_FILTER when a
+ // filter is needed in only one direction.
+ if (!vp10_is_interp_needed(xd))
+ tmp_rate2 -= rs;
+#endif // CONFIG_EXT_INTERP
vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
#if CONFIG_EXT_INTER
} else {
@@ -7787,6 +7815,14 @@
#if CONFIG_OBMC
tmp_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
if (mbmi->obmc == 0 || (tmp_rd < best_rd)) {
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = obmc_interp_filter[mbmi->obmc][0];
+ mbmi->interp_filter[1] = obmc_interp_filter[mbmi->obmc][1];
+#else
+ mbmi->interp_filter = obmc_interp_filter[mbmi->obmc];
+#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_EXT_INTERP
best_mbmi = *mbmi;
best_rd = tmp_rd;
best_rate2 = *rate2;
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index 2ca39a5..be6227b 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -90,24 +90,6 @@
int use_fast_coef_casting);
#endif // CONFIG_SUPERTX
-static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
- const int ref) {
- // Use up-sampled reference frames.
- int ref_idx = 0;
- if (ref == LAST_FRAME)
-#if CONFIG_EXT_REFS
- ref_idx = cpi->lst_fb_idxes[ref - LAST_FRAME];
-#else
- ref_idx = cpi->lst_fb_idx;
-#endif // CONFIG_EXT_REFS
- else if (ref == GOLDEN_FRAME)
- ref_idx = cpi->gld_fb_idx;
- else if (ref == ALTREF_FRAME)
- ref_idx = cpi->alt_fb_idx;
-
- return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
-}
-
#if CONFIG_OBMC
void calc_target_weighted_pred(VP10_COMMON *cm,
MACROBLOCK *x,
diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
index 2ad59cf..8b27f55 100644
--- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -997,6 +997,7 @@
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
cfg = &fwd_txfm_2d_cfg_adst_dct_8;
load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
@@ -1047,6 +1048,7 @@
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
+#endif // CONFIG_EXT_TX
default:
assert(0);
}
@@ -1893,6 +1895,7 @@
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
cfg = &fwd_txfm_2d_cfg_adst_dct_16;
load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
@@ -1943,6 +1946,7 @@
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
+#endif // CONFIG_EXT_TX
default:
assert(0);
}
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index a5802e1..4c0d5db 100644
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/fwd_txfm.h"
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 402fd9a..533f762 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -11,6 +11,7 @@
#include <math.h>
#include <string.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 46ef646..645a1ab 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -11,6 +11,7 @@
#include <stdlib.h>
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e4e741a..6426ccc 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/quantize.h"
#include "vpx_mem/vpx_mem.h"
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 4df39df..951af3a 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -10,6 +10,7 @@
#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#define pair256_set_epi16(a, b) \
diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
index e4deeec..3e4f49b 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -11,6 +11,7 @@
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"