Add CONFIG_DAALA_DCT4 experiment.
This experiment replaces the 4-point Type-II scaled-output vp9 DCT
transform with the 4-point Type-II orthonormal Daala DCT transform.
Right now the CONFIG_DAALA_DCT4 experiment depends on CONFIG_DCT_ONLY
as it does not add an orthonormal 4-point DST.
subset-1:
monty-baseline-dctonly-squaretx-subset1 ->
monty-dct4-dctonly-squaretx-subset1-rerun
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
0.0055 | -0.0132 | -0.0405 | 0.0261 | 0.0005 | 0.0246 | 0.0226
objective-1-fast:
monty-baseline-dctonly-squaretx-o1f ->
monty-dct4-dctonly-squaretx-o1f
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
-0.0215 | -0.1573 | N/A | -0.0131 | -0.0347 | -0.0390 | -0.1121
Change-Id: Idef8f6e5525037d5bbb2d0927675c21d1922d69a
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index c4efae6..13bebef 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -14,6 +14,9 @@
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/inv_txfm.h"
+#if CONFIG_DAALA_DCT4
+#include "av1/common/daala_tx.h"
+#endif
void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
@@ -93,6 +96,18 @@
}
}
+#if CONFIG_DAALA_DCT4
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ od_coeff x[4];
+ od_coeff y[4];
+ for (i = 0; i < 4; i++) y[i] = input[i];
+ od_bin_idct4(x, 1, y);
+ for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i];
+}
+
+#else
+
void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step[4];
tran_high_t temp1, temp2;
@@ -112,6 +127,7 @@
output[2] = WRAPLOW(step[1] - step[2]);
output[3] = WRAPLOW(step[0] - step[3]);
}
+#endif
void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
tran_low_t out[4 * 4];
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f269c72..c2811ca 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -33,6 +33,8 @@
"${AOM_ROOT}/av1/common/common_data.h"
"${AOM_ROOT}/av1/common/convolve.c"
"${AOM_ROOT}/av1/common/convolve.h"
+ "${AOM_ROOT}/av1/common/daala_tx.c"
+ "${AOM_ROOT}/av1/common/daala_tx.h"
"${AOM_ROOT}/av1/common/debugmodes.c"
"${AOM_ROOT}/av1/common/entropy.c"
"${AOM_ROOT}/av1/common/entropy.h"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index c18f7d5..95263ab 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -24,6 +24,8 @@
AV1_COMMON_SRCS-yes += common/alloccommon.h
AV1_COMMON_SRCS-yes += common/blockd.h
AV1_COMMON_SRCS-yes += common/common.h
+AV1_COMMON_SRCS-yes += common/daala_tx.c
+AV1_COMMON_SRCS-yes += common/daala_tx.h
AV1_COMMON_SRCS-yes += common/entropy.h
AV1_COMMON_SRCS-yes += common/entropymode.h
AV1_COMMON_SRCS-yes += common/entropymv.h
diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c
new file mode 100644
index 0000000..32ad568
--- /dev/null
+++ b/av1/common/daala_tx.c
@@ -0,0 +1,119 @@
+#include "av1/common/daala_tx.h"
+#include "av1/common/odintrin.h"
+
+/* clang-format off */
+
+# define OD_DCT_RSHIFT(_a, _b) OD_UNBIASED_RSHIFT32(_a, _b)
+
+/* TODO: Daala DCT overflow checks need to be ported as a later test */
+# if defined(OD_DCT_CHECK_OVERFLOW)
+# else
+# define OD_DCT_OVERFLOW_CHECK(val, scale, offset, idx)
+# endif
+
+#define OD_FDCT_2_ASYM(p0, p1, p1h) \
+ /* Embedded 2-point asymmetric Type-II fDCT. */ \
+ do { \
+ p0 += p1h; \
+ p1 = p0 - p1; \
+ } \
+ while (0)
+
+#define OD_IDCT_2_ASYM(p0, p1, p1h) \
+ /* Embedded 2-point asymmetric Type-II iDCT. */ \
+ do { \
+ p1 = p0 - p1; \
+ p1h = OD_DCT_RSHIFT(p1, 1); \
+ p0 -= p1h; \
+ } \
+ while (0)
+
+#define OD_FDST_2_ASYM(p0, p1) \
+ /* Embedded 2-point asymmetric Type-IV fDST. */ \
+ do { \
+ /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
+ OD_DCT_OVERFLOW_CHECK(p1, 11507, 8192, 187); \
+ p0 -= (p1*11507 + 8192) >> 14; \
+ /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
+ OD_DCT_OVERFLOW_CHECK(p0, 669, 512, 188); \
+ p1 += (p0*669 + 512) >> 10; \
+ /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
+ OD_DCT_OVERFLOW_CHECK(p1, 4573, 2048, 189); \
+ p0 -= (p1*4573 + 2048) >> 12; \
+ } \
+ while (0)
+
+#define OD_IDST_2_ASYM(p0, p1) \
+ /* Embedded 2-point asymmetric Type-IV iDST. */ \
+ do { \
+ /* 4573/4096 ~= 4*Sin[Pi/8] - Tan[Pi/8] ~= 1.11652016708726 */ \
+ p0 += (p1*4573 + 2048) >> 12; \
+ /* 669/1024 ~= Cos[Pi/8]/Sqrt[2] ~= 0.653281482438188 */ \
+ p1 -= (p0*669 + 512) >> 10; \
+ /* 11507/16384 ~= 4*Sin[Pi/8] - 2*Tan[Pi/8] ~= 0.702306604714169 */ \
+ p0 += (p1*11507 + 8192) >> 14; \
+ } \
+ while (0)
+
+#define OD_FDCT_4(q0, q2, q1, q3) \
+ /* Embedded 4-point orthonormal Type-II fDCT. */ \
+ do { \
+ int q2h; \
+ int q3h; \
+ q3 = q0 - q3; \
+ q3h = OD_DCT_RSHIFT(q3, 1); \
+ q0 -= q3h; \
+ q2 += q1; \
+ q2h = OD_DCT_RSHIFT(q2, 1); \
+ q1 = q2h - q1; \
+ OD_FDCT_2_ASYM(q0, q2, q2h); \
+ OD_FDST_2_ASYM(q3, q1); \
+ } \
+ while (0)
+
+#define OD_IDCT_4(q0, q2, q1, q3) \
+ /* Embedded 4-point orthonormal Type-II iDCT. */ \
+ do { \
+ int q1h; \
+ int q3h; \
+ OD_IDST_2_ASYM(q3, q2); \
+ OD_IDCT_2_ASYM(q0, q1, q1h); \
+ q3h = OD_DCT_RSHIFT(q3, 1); \
+ q0 += q3h; \
+ q3 = q0 - q3; \
+ q2 = q1h - q2; \
+ q1 -= q2; \
+ } \
+ while (0)
+
+void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) {
+ int q0;
+ int q1;
+ int q2;
+ int q3;
+ q0 = x[0*xstride];
+ q2 = x[1*xstride];
+ q1 = x[2*xstride];
+ q3 = x[3*xstride];
+ OD_FDCT_4(q0, q2, q1, q3);
+ y[0] = (od_coeff)q0;
+ y[1] = (od_coeff)q1;
+ y[2] = (od_coeff)q2;
+ y[3] = (od_coeff)q3;
+}
+
+void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]) {
+ int q0;
+ int q1;
+ int q2;
+ int q3;
+ q0 = y[0];
+ q2 = y[1];
+ q1 = y[2];
+ q3 = y[3];
+ OD_IDCT_4(q0, q2, q1, q3);
+ x[0*xstride] = q0;
+ x[1*xstride] = q1;
+ x[2*xstride] = q2;
+ x[3*xstride] = q3;
+}
diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h
new file mode 100644
index 0000000..5b739c6
--- /dev/null
+++ b/av1/common/daala_tx.h
@@ -0,0 +1,9 @@
+#ifndef AOM_DSP_DAALA_TX_H_
+#define AOM_DSP_DAALA_TX_H_
+
+#include "av1/common/odintrin.h"
+
+void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride);
+void od_bin_idct4(od_coeff *x, int xstride, const od_coeff y[4]);
+
+#endif
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 49a91fb..80a671d 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -36,8 +36,13 @@
#if CONFIG_EXT_TX
static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
int i;
- for (i = 0; i < 4; ++i)
+ for (i = 0; i < 4; ++i) {
+#if CONFIG_DAALA_DCT4
+ output[i] = input[i];
+#else
output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+#endif
+ }
}
static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
@@ -249,10 +254,12 @@
void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
const INV_TXFM_PARAM *param) {
int tx_type = param->tx_type;
+#if !CONFIG_DAALA_DCT4
if (tx_type == DCT_DCT) {
aom_idct4x4_16_add(input, dest, stride);
return;
}
+#endif
static const transform_2d IHT_4[] = {
{ aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0
{ aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1
@@ -293,12 +300,18 @@
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
+#if CONFIG_DAALA_DCT4
+ tran_low_t temp_in[4];
+ for (j = 0; j < 4; j++) temp_in[j] = input[j] << 1;
+ IHT_4[tx_type].rows(temp_in, out[i]);
+#else
#if CONFIG_LGT
if (use_lgt_row)
ilgt4(input, out[i], lgtmtx_row[i]);
else
#endif
IHT_4[tx_type].rows(input, out[i]);
+#endif
input += 4;
}
@@ -328,7 +341,11 @@
for (j = 0; j < 4; ++j) {
int d = i * stride + j;
int s = j * outstride + i;
+#if CONFIG_DAALA_DCT4
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#endif
}
}
}
@@ -1440,7 +1457,11 @@
}
switch (tx_type) {
+#if !CONFIG_DAALA_DCT4
case DCT_DCT: av1_idct4x4_add(input, dest, stride, param); break;
+#else
+ case DCT_DCT:
+#endif
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index 3a778ab..a50c456 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -89,6 +89,12 @@
typedef int od_coeff;
+/*This is the strength reduced version of ((_a)/(1 << (_b))).
+ This will not work for _b == 0, however currently this is only used for
+ b == 1 anyway.*/
+# define OD_UNBIASED_RSHIFT32(_a, _b) \
+ (((int32_t)(((uint32_t)(_a) >> (32 - (_b))) + (_a))) >> (_b))
+
#define OD_DIVU_DMAX (1024)
extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 198c76d..e9d166c 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -21,6 +21,9 @@
#include "av1/common/av1_fwd_txfm1d.h"
#include "av1/common/av1_fwd_txfm1d_cfg.h"
#include "av1/common/idct.h"
+#if CONFIG_DAALA_DCT4
+#include "av1/common/daala_tx.h"
+#endif
static INLINE void range_check(const tran_low_t *input, const int size,
const int bit) {
@@ -39,6 +42,18 @@
#endif
}
+#if CONFIG_DAALA_DCT4
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ od_coeff x[4];
+ od_coeff y[4];
+ for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
+ od_bin_fdct4(y, x, 1);
+ for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
+}
+
+#else
+
static void fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[4];
@@ -74,6 +89,7 @@
range_check(output, 4, 16);
}
+#endif
static void fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
@@ -1078,8 +1094,13 @@
// being used for square transforms.
static void fidtx4(const tran_low_t *input, tran_low_t *output) {
int i;
- for (i = 0; i < 4; ++i)
+ for (i = 0; i < 4; ++i) {
+#if CONFIG_DAALA_DCT4
+ output[i] = input[i];
+#else
output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
+#endif
+ }
}
static void fidtx8(const tran_low_t *input, tran_low_t *output) {
@@ -1199,9 +1220,13 @@
#if CONFIG_DCT_ONLY
assert(tx_type == DCT_DCT);
#endif
+#if !CONFIG_DAALA_DCT4
if (tx_type == DCT_DCT) {
aom_fdct4x4_c(input, output, stride);
- } else {
+ return;
+ }
+#endif
+ {
static const transform_2d FHT[] = {
{ fdct4, fdct4 }, // DCT_DCT
{ fadst4, fdct4 }, // ADST_DCT
@@ -1243,8 +1268,11 @@
// Columns
for (i = 0; i < 4; ++i) {
+ /* A C99-safe upshift by 4 for both Daala and VPx TX. */
for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
+#if !CONFIG_DAALA_DCT4
if (i == 0 && temp_in[0]) temp_in[0] += 1;
+#endif
#if CONFIG_LGT
if (use_lgt_col)
flgt4(temp_in, temp_out, lgtmtx_col[i]);
@@ -1263,7 +1291,13 @@
else
#endif
ht.rows(temp_in, temp_out);
+#if CONFIG_DAALA_DCT4
+ /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
+ the usual VPx coefficient left-shift of 3. */
+ for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
+#else
for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
+#endif
}
}
}
diff --git a/configure b/configure
index b4f75fc..96faac0 100755
--- a/configure
+++ b/configure
@@ -291,6 +291,7 @@
cfl
xiphrc
dct_only
+ daala_dct4
cb4x4
chroma_2x2
chroma_sub8x8
@@ -559,6 +560,14 @@
log_echo "ec_smallmul requires not ans, so disabling ec_smallmul"
disable_feature ec_smallmul
fi
+ if enabled daala_dct4; then
+ enable_feature dct_only
+ disable_feature mmx
+ disable_feature rect_tx
+ disable_feature var_tx
+ disable_feature lgt
+ enable_feature lowbitdepth
+ fi
if enabled ext_tile; then
log_echo "ext_tile not compatible with reference_buffer, so"
log_echo "disabling reference_buffer"
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index 691cc8b..2d4f26b 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -23,6 +23,9 @@
#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
#define AV1_DCT_GTEST
#include "av1/encoder/dct.c"
+#if CONFIG_DAALA_DCT4
+#include "av1/common/daala_tx.c"
+#endif
using libaom_test::ACMRandom;