Merge "Code class0 using aom_read() / aom_write()." into nextgenv2
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 53c9dc5..e66826f 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -391,6 +391,9 @@
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x16 sse2 avx2/;
+add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x32 avx2/;
+
if (aom_config("CONFIG_EXT_TX") eq "yes") {
add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht4x8 sse2/;
@@ -409,9 +412,6 @@
add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x16 sse2/;
-
- add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht32x32 avx2/;
}
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 63b71a5..221e3cd 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -325,7 +325,6 @@
range_check(output, 16, 16);
}
-#if CONFIG_EXT_TX
static void fdct32(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[32];
@@ -723,7 +722,6 @@
range_check(output, 32, 18);
}
-#endif // CONFIG_EXT_TX
static void fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
@@ -1809,57 +1807,74 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-#if CONFIG_EXT_TX
+// TODO(luoyi): This range check is a workaround to avoid DCT_DCT overflow;
+// remove it once the column txfm output is scaled correctly.
+static INLINE int range_check_dct32x32(const int16_t *input, int stride,
+                                       int16_t bound, int rows, int cols) {
+  int r, c;
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (abs(input[r * stride + c]) > bound) return 1;
+    }
+  }
+  return 0;
+}
+
void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
- if (tx_type == DCT_DCT) {
- aom_fdct32x32_c(input, output, stride);
- } else {
- static const transform_2d FHT[] = {
- { fdct32, fdct32 }, // DCT_DCT
- { fhalfright32, fdct32 }, // ADST_DCT
- { fdct32, fhalfright32 }, // DCT_ADST
- { fhalfright32, fhalfright32 }, // ADST_ADST
- { fhalfright32, fdct32 }, // FLIPADST_DCT
- { fdct32, fhalfright32 }, // DCT_FLIPADST
- { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST
- { fhalfright32, fhalfright32 }, // ADST_FLIPADST
- { fhalfright32, fhalfright32 }, // FLIPADST_ADST
- { fidtx32, fidtx32 }, // IDTX
- { fdct32, fidtx32 }, // V_DCT
- { fidtx32, fdct32 }, // H_DCT
- { fhalfright32, fidtx32 }, // V_ADST
- { fidtx32, fhalfright32 }, // H_ADST
- { fhalfright32, fidtx32 }, // V_FLIPADST
- { fidtx32, fhalfright32 }, // H_FLIPADST
- };
- const transform_2d ht = FHT[tx_type];
- tran_low_t out[1024];
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
+ static const transform_2d FHT[] = {
+ { fdct32, fdct32 }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct32 }, // ADST_DCT
+ { fdct32, fhalfright32 }, // DCT_ADST
+ { fhalfright32, fhalfright32 }, // ADST_ADST
+ { fhalfright32, fdct32 }, // FLIPADST_DCT
+ { fdct32, fhalfright32 }, // DCT_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // ADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_ADST
+ { fidtx32, fidtx32 }, // IDTX
+ { fdct32, fidtx32 }, // V_DCT
+ { fidtx32, fdct32 }, // H_DCT
+ { fhalfright32, fidtx32 }, // V_ADST
+ { fidtx32, fhalfright32 }, // H_ADST
+ { fhalfright32, fidtx32 }, // V_FLIPADST
+ { fidtx32, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[1024];
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
- int16_t flipped_input[32 * 32];
- maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 32];
+ maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#endif
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- ht.cols(temp_in, temp_out);
- for (j = 0; j < 32; ++j)
- out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  if (tx_type == DCT_DCT) {
+    // Residuals of magnitude 64 or more can overflow the scaled fdct32
+    // path below; route such blocks to the legacy transform instead.
+    if (range_check_dct32x32(input, stride, (1 << 6) - 1, 32, 32)) {
+      aom_fdct32x32_c(input, output, stride);
+      return;
}
+ }
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
- // Rows
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
- ht.rows(temp_in, temp_out);
- for (j = 0; j < 32; ++j)
- output[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ output[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
+#if CONFIG_EXT_TX
// Forward identity transform.
void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
int bs, int tx_type) {
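Note: the guard above makes the C DCT_DCT path self-selecting: blocks whose
residuals all fit in +/-63 go through the shared fdct32-based hybrid code,
while anything larger silently falls back to aom_fdct32x32_c(). A minimal
caller-side sketch of the two behaviors (illustrative only; the harness,
header paths, and calling pattern are assumptions, not part of the patch):

/* Hypothetical demo, not part of the patch. */
#include <string.h>
#include "av1/common/av1_rtcd.h"  /* av1_fht32x32_c(), assumed path */
#include "av1/common/enums.h"     /* DCT_DCT, assumed path */

static void demo_dct_dct_fallback(void) {
  int16_t input[32 * 32];
  tran_low_t coeff[32 * 32];
  memset(input, 0, sizeof(input));

  input[0] = 63;  /* every residual within +/-63: fdct32-based path */
  av1_fht32x32_c(input, coeff, 32 /* stride */, DCT_DCT);

  input[0] = 64;  /* exceeds the bound: falls back to aom_fdct32x32_c() */
  av1_fht32x32_c(input, coeff, 32 /* stride */, DCT_DCT);
}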
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 1103c4b..6d5eccd 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -21,7 +21,7 @@
if (rd_transform)
aom_fdct32x32_rd(src, dst, src_stride);
else
- aom_fdct32x32(src, dst, src_stride);
+ av1_fht32x32(src, dst, src_stride, DCT_DCT);
}
static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
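With this one-line change the non-RD 32x32 forward transform funnels through
av1_fht32x32(), so the DCT_DCT overflow fallback lives in one place, while the
RD search keeps the reduced-precision aom_fdct32x32_rd(). A consolidated view
of the dispatcher (the enclosing function's real name and signature are
assumed; only the branch shown in the hunk is taken from the patch):

/* Sketch only; name and signature are assumptions. */
static void fwd_txfm_32x32_sketch(int rd_transform, const int16_t *src,
                                  tran_low_t *dst, int src_stride) {
  if (rd_transform)
    aom_fdct32x32_rd(src, dst, src_stride);      /* RD-search variant */
  else
    av1_fht32x32(src, dst, src_stride, DCT_DCT); /* unified hybrid path */
}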
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 69bf89a..928af13 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -198,8 +198,8 @@
in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
-static void load_buffer_16x16(const int16_t *input, int stride, int flipud,
- int fliplr, __m256i *in) {
+static INLINE void load_buffer_16x16(const int16_t *input, int stride,
+ int flipud, int fliplr, __m256i *in) {
if (!flipud) {
in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
@@ -1273,7 +1273,6 @@
_mm256_zeroupper();
}
-#if CONFIG_EXT_TX
static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
int i = 0;
__m256i temp;
@@ -1622,7 +1621,6 @@
mm256_transpose_32x32(in0, in1);
}
-#endif // CONFIG_EXT_TX
static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
int stride, tran_low_t *output) {
@@ -1667,9 +1665,11 @@
mm256_vectors_swap(in1, &in1[16], 16);
mm256_transpose_32x32(in0, in1);
}
+#endif // CONFIG_EXT_TX
-static void load_buffer_32x32(const int16_t *input, int stride, int flipud,
- int fliplr, __m256i *in0, __m256i *in1) {
+static INLINE void load_buffer_32x32(const int16_t *input, int stride,
+ int flipud, int fliplr, __m256i *in0,
+ __m256i *in1) {
// Load 4 16x16 blocks
const int16_t *topL = input;
const int16_t *topR = input + 16;
@@ -1708,7 +1708,6 @@
load_buffer_16x16(topR, stride, flipud, fliplr, in1);
load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
}
-#endif // CONFIG_EXT_TX
static void nr_right_shift_32x32_16col(__m256i *in) {
int i = 0;
@@ -1729,8 +1728,7 @@
nr_right_shift_32x32_16col(in1);
}
-#if CONFIG_EXT_TX
-static void pr_right_shift_32x32_16col(__m256i *in) {
+static INLINE void pr_right_shift_32x32_16col(__m256i *in) {
int i = 0;
const __m256i zero = _mm256_setzero_si256();
const __m256i one = _mm256_set1_epi16(1);
@@ -1745,11 +1743,12 @@
}
// Positive rounding
-static void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+static INLINE void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
pr_right_shift_32x32_16col(in0);
pr_right_shift_32x32_16col(in1);
}
+#if CONFIG_EXT_TX
static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
int i = 0;
while (i < 32) {
@@ -1761,23 +1760,42 @@
}
#endif
+// Returns 1 if any 16-bit residual in the loaded 32x32 block exceeds the
+// DCT_DCT overflow bound (see the matching check in av1/encoder/dct.c).
+static INLINE int range_check_dct32x32(const __m256i *in0, const __m256i *in1,
+                                       int rows) {
+  __m256i value, bits0, bits1;
+  const __m256i bound = _mm256_set1_epi16((1 << 6) - 1);
+  int flag;
+  int i = 0;
+
+  while (i < rows) {
+    value = _mm256_abs_epi16(in0[i]);
+    bits0 = _mm256_cmpgt_epi16(value, bound);
+    value = _mm256_abs_epi16(in1[i]);
+    bits1 = _mm256_cmpgt_epi16(value, bound);
+    bits0 = _mm256_or_si256(bits0, bits1);
+    flag = _mm256_movemask_epi8(bits0);
+    if (flag) return 1;
+    i++;
+  }
+  return 0;
+}
+
void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
__m256i in0[32]; // left 32 columns
__m256i in1[32]; // right 32 columns
- (void)input;
- (void)stride;
switch (tx_type) {
-// TODO(luoyi): For DCT_DCT, fwd_txfm_32x32() uses aom set. But this
-// function has better speed. The replacement must work with the
-// corresponding inverse transform.
-// case DCT_DCT:
-// load_buffer_32x32(input, stride, 0, 0, in0, in1);
-// fdct32_avx2(in0, in1);
-// pr_right_shift_32x32(in0, in1);
-// fdct32_avx2(in0, in1);
-// break;
+ case DCT_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
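+      // Large residuals can overflow the 16-bit fdct32_avx2 pipeline;
+      // fall back to the legacy transform for such blocks.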
+ if (range_check_dct32x32(in0, in1, 32)) {
+ aom_fdct32x32_avx2(input, output, stride);
+ return;
+ }
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
#if CONFIG_EXT_TX
case ADST_DCT:
load_buffer_32x32(input, stride, 0, 0, in0, in1);
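The AVX2 range check collapses the lane-wise comparison to a scalar with
_mm256_movemask_epi8 instead of a scalar loop. A standalone illustration of
that idiom (hypothetical helper, not part of the patch); note that, like the
patch, it relies on residuals never being INT16_MIN, whose absolute value
wraps under _mm256_abs_epi16 and would escape the signed compare:

/* Hypothetical helper, not part of the patch. */
#include <stdint.h>
#include <immintrin.h>

/* Returns nonzero if any of the 16 int16 lanes has |x| > 63. */
static int any_abs_gt_63(const int16_t *x) {
  const __m256i bound = _mm256_set1_epi16((1 << 6) - 1);
  const __m256i v = _mm256_loadu_si256((const __m256i *)x);
  const __m256i gt = _mm256_cmpgt_epi16(_mm256_abs_epi16(v), bound);
  return _mm256_movemask_epi8(gt);  /* one bit per byte lane; 0 iff none */
}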
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index ac1a551..d5c23f6 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -102,5 +102,6 @@
C, AV1FwdTxfm,
::testing::Values(FdctParam(&fdct4, &reference_dct_1d, 4, 1),
FdctParam(&fdct8, &reference_dct_1d, 8, 1),
- FdctParam(&fdct16, &reference_dct_1d, 16, 2)));
+ FdctParam(&fdct16, &reference_dct_1d, 16, 2),
+ FdctParam(&fdct32, &reference_dct_1d, 32, 3)));
} // namespace
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index a949ebf..3d07b44 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -69,6 +69,7 @@
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
pitch_ = 32;
+ height_ = 32;
fwd_txfm_ref = fht32x32_ref;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
@@ -90,6 +91,7 @@
};
TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
#if CONFIG_AOM_HIGHBITDEPTH
class AV1HighbdTrans32x32HT
@@ -164,8 +166,7 @@
#if HAVE_AVX2
const Ht32x32Param kArrayHt32x32Param_avx2[] = {
- // TODO(luoyi): DCT_DCT tx_type is not enabled in av1_fht32x32_c(avx2) yet.
- // make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),