Remove obsoleted skip_block for quantize_fp
1. Remove skip_block from function parameter list
2. Remove branch for skip_block == 1
3. Function list
av1_quantize_fp_{c,sse2,avx2}
av1_quantize_fp_32x32{c,avx2}
av1_quantize_fp_64x64{c,avx2}
Change-Id: Ic0f336fb5caa5a246978dc14a832b2985a4a3102
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index acfe4ab..1278a90 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -166,13 +166,13 @@
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/av1_block_error avx2/;
- add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/av1_quantize_fp sse2 avx2/;
- add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/av1_quantize_fp_32x32 avx2/;
- add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/av1_quantize_fp_64x64 avx2/;
# fdct functions
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 757c9f0..0264951 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -32,8 +32,8 @@
}
static void quantize_fp_helper_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
@@ -47,7 +47,7 @@
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block && qm_ptr == NULL && iqm_ptr == NULL) {
+ if (qm_ptr == NULL && iqm_ptr == NULL) {
const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
{ // rc == 0
const int coeff = coeff_ptr[0];
@@ -81,7 +81,7 @@
}
}
}
- } else if (!skip_block) {
+ } else {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
for (i = 0; i < n_coeffs; i++) {
@@ -195,62 +195,58 @@
}
void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
- quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
- dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0);
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
}
void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
- dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
}
void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
- dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
}
void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
- // obsolete skip_block
- const int skip_block = 0;
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
if (qm_ptr != NULL && iqm_ptr != NULL) {
- quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan, qm_ptr, iqm_ptr,
- qparam->log_scale);
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
} else {
switch (qparam->log_scale) {
case 0:
if (n_coeffs < 16) {
// TODO(jingning): Need SIMD implementation for smaller block size
// quantization.
- quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX,
p->round_fp_QTX, p->quant_fp_QTX,
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan,
@@ -258,34 +254,34 @@
} else {
if (qparam->tx_size == TX_4X16 || qparam->tx_size == TX_16X4 ||
qparam->tx_size == TX_8X32 || qparam->tx_size == TX_32X8)
- av1_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_fp_QTX, p->quant_fp_QTX,
- p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ av1_quantize_fp_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
else
- av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_fp_QTX, p->quant_fp_QTX,
- p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
}
break;
case 1:
if (qparam->tx_size == TX_16X64 || qparam->tx_size == TX_64X16)
- av1_quantize_fp_32x32_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+ av1_quantize_fp_32x32_c(coeff_ptr, n_coeffs, p->zbin_QTX,
p->round_fp_QTX, p->quant_fp_QTX,
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
else
- av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX,
p->round_fp_QTX, p->quant_fp_QTX,
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
break;
case 2:
- av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_fp_QTX, p->quant_fp_QTX,
- p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
default: assert(0);
}
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index fd7a081..b572d03 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -132,8 +132,8 @@
}
void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -143,15 +143,26 @@
(void)quant_shift_ptr;
const unsigned int step = 16;
- if (LIKELY(!skip_block)) {
- __m256i qp[3];
- __m256i coeff, thr;
- const int log_scale = 0;
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 0;
- init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
read_coeff(coeff_ptr, &coeff);
-
- __m256i eob = _mm256_setzero_si256();
quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -159,30 +170,8 @@
dqcoeff_ptr += step;
iscan_ptr += step;
n_coeffs -= step;
-
- update_qp(log_scale, &thr, qp);
-
- while (n_coeffs > 0) {
- read_coeff(coeff_ptr, &coeff);
- quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan_ptr += step;
- n_coeffs -= step;
- }
- *eob_ptr = quant_gather_eob(eob);
- } else {
- do {
- write_zero(qcoeff_ptr);
- write_zero(dqcoeff_ptr);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
}
+ *eob_ptr = quant_gather_eob(eob);
}
static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
@@ -219,8 +208,8 @@
}
void av1_quantize_fp_32x32_avx2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan_ptr, const int16_t *iscan_ptr) {
@@ -229,15 +218,26 @@
(void)quant_shift_ptr;
const unsigned int step = 16;
- if (LIKELY(!skip_block)) {
- __m256i qp[3];
- __m256i coeff, thr;
- const int log_scale = 1;
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 1;
- init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
read_coeff(coeff_ptr, &coeff);
-
- __m256i eob = _mm256_setzero_si256();
quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -245,31 +245,8 @@
dqcoeff_ptr += step;
iscan_ptr += step;
n_coeffs -= step;
-
- update_qp(log_scale, &thr, qp);
-
- while (n_coeffs > 0) {
- read_coeff(coeff_ptr, &coeff);
- quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
- &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan_ptr += step;
- n_coeffs -= step;
- }
- *eob_ptr = quant_gather_eob(eob);
- } else {
- do {
- write_zero(qcoeff_ptr);
- write_zero(dqcoeff_ptr);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
}
+ *eob_ptr = quant_gather_eob(eob);
}
static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
@@ -310,8 +287,8 @@
}
void av1_quantize_fp_64x64_avx2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan_ptr, const int16_t *iscan_ptr) {
@@ -320,15 +297,26 @@
(void)quant_shift_ptr;
const unsigned int step = 16;
- if (LIKELY(!skip_block)) {
- __m256i qp[3];
- __m256i coeff, thr;
- const int log_scale = 2;
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 2;
- init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
read_coeff(coeff_ptr, &coeff);
-
- __m256i eob = _mm256_setzero_si256();
quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -336,29 +324,6 @@
dqcoeff_ptr += step;
iscan_ptr += step;
n_coeffs -= step;
-
- update_qp(log_scale, &thr, qp);
-
- while (n_coeffs > 0) {
- read_coeff(coeff_ptr, &coeff);
- quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
- &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan_ptr += step;
- n_coeffs -= step;
- }
- *eob_ptr = quant_gather_eob(eob);
- } else {
- do {
- write_zero(qcoeff_ptr);
- write_zero(dqcoeff_ptr);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
}
+ *eob_ptr = quant_gather_eob(eob);
}
diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c
index 4f7c095..0f10e38 100644
--- a/av1/encoder/x86/av1_quantize_sse2.c
+++ b/av1/encoder/x86/av1_quantize_sse2.c
@@ -68,8 +68,8 @@
}
void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -88,39 +88,104 @@
n_coeffs = -n_coeffs;
zero = _mm_setzero_si128();
- if (!skip_block) {
- __m128i eob;
- __m128i round, quant, dequant;
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
{
- __m128i coeff0, coeff1;
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ }
- // Setup global values
- {
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- }
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
qcoeff1 = _mm_adds_epi16(qcoeff1, round);
qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
// Reinsert signs
@@ -132,121 +197,47 @@
write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+ } else {
+ write_zero(qcoeff_ptr, n_coeffs);
+ write_zero(dqcoeff_ptr, n_coeffs);
}
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
}
- thr = _mm_srai_epi16(dequant, 1);
-
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
- } else {
- write_zero(qcoeff_ptr, n_coeffs);
- write_zero(dqcoeff_ptr, n_coeffs);
- }
- }
-
- if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
}
+ n_coeffs += 8 * 2;
+ }
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- write_zero(dqcoeff_ptr, n_coeffs);
- write_zero(qcoeff_ptr, n_coeffs);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
}
}
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 71c8d94..5318f21 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -34,14 +34,26 @@
const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, \
const int16_t *iscan
+#define QUAN_PARAM_LIST_NO_SKIP \
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, \
+ const int16_t *round_ptr, const int16_t *quant_ptr, \
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, \
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \
+ const int16_t *scan, const int16_t *iscan
+
typedef void (*QuantizeFunc)(QUAN_PARAM_LIST);
typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale);
+typedef void (*QuantizeFuncNoSkip)(QUAN_PARAM_LIST_NO_SKIP);
#define HBD_QUAN_FUNC \
fn(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, quant_ptr, \
quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, \
iscan, log_scale)
+#define LBD_QUAN_FUNC_NO_SKIP \
+ fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan)
+
template <QuantizeFuncHbd fn>
void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) {
const int log_scale = 0;
@@ -60,6 +72,12 @@
HBD_QUAN_FUNC;
}
+template <QuantizeFuncNoSkip fn>
+void lowbd_quan_wrapper(QUAN_PARAM_LIST) {
+ (void)skip_block;
+ LBD_QUAN_FUNC_NO_SKIP;
+}
+
typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType;
typedef ::testing::tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType,
@@ -297,12 +315,15 @@
#if HAVE_AVX2
const QuantizeParam kQParamArrayAvx2[] = {
- make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X16, TYPE_FP,
+ make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+ &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_16X16, TYPE_FP,
AOM_BITS_8),
- make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
- TYPE_FP, AOM_BITS_8),
- make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, TX_64X64,
- TYPE_FP, AOM_BITS_8),
+ make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
+ &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_32X32, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_64x64_c>,
+ &lowbd_quan_wrapper<av1_quantize_fp_64x64_avx2>, TX_64X64, TYPE_FP,
+ AOM_BITS_8),
make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
TYPE_FP, AOM_BITS_8),
@@ -344,7 +365,8 @@
#if HAVE_SSE2
const QuantizeParam kQParamArraySSE2[] = {
- make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X16, TYPE_FP,
+ make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
+ &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_16X16, TYPE_FP,
AOM_BITS_8),
make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
TYPE_B, AOM_BITS_8),