Merge "Renamings for OBMC experiment" into nextgenv2
diff --git a/aom_dsp/ans.c b/aom_dsp/ans.c
index 18f6d48..30f115c 100644
--- a/aom_dsp/ans.c
+++ b/aom_dsp/ans.c
@@ -15,16 +15,7 @@
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
-void aom_rans_build_cdf_from_pdf(const AnsP10 token_probs[], rans_lut cdf_tab) {
- int i;
- cdf_tab[0] = 0;
- for (i = 1; cdf_tab[i - 1] < RANS_PRECISION; ++i) {
- cdf_tab[i] = cdf_tab[i - 1] + token_probs[i - 1];
- }
- assert(cdf_tab[i - 1] == RANS_PRECISION);
-}
-
-static int find_largest(const AnsP10 *const pdf_tab, int num_syms) {
+static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
int largest_idx = -1;
int largest_p = -1;
int i;
@@ -38,8 +29,9 @@
return largest_idx;
}
-void aom_rans_merge_prob8_pdf(AnsP10 *const out_pdf, const AnsP8 node_prob,
- const AnsP10 *const src_pdf, int in_syms) {
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+ const AnsP8 node_prob,
+ const aom_cdf_prob *const src_pdf, int in_syms) {
int i;
int adjustment = RANS_PRECISION;
const int round_fact = ANS_P8_PRECISION >> 1;
diff --git a/aom_dsp/ans.h b/aom_dsp/ans.h
index ea99f8b..5927e58 100644
--- a/aom_dsp/ans.h
+++ b/aom_dsp/ans.h
@@ -26,24 +26,16 @@
typedef uint8_t AnsP8;
#define ANS_P8_PRECISION 256u
#define ANS_P8_SHIFT 8
-typedef uint16_t AnsP10;
-#define ANS_P10_PRECISION 1024u
+#define RANS_PRECISION 1024u
#define RANS_PROB_BITS 10
-#define RANS_PRECISION ANS_P10_PRECISION
-
-#define L_BASE (ANS_P10_PRECISION * 4) // L_BASE % precision must be 0
+#define L_BASE (RANS_PRECISION * 4) // L_BASE % precision must be 0
#define IO_BASE 256
// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
-// This is now just a boring cdf. It starts with an explicit zero.
-// TODO(aconverse): Remove starting zero.
-typedef uint16_t rans_lut[16];
-
-void aom_rans_build_cdf_from_pdf(const AnsP10 token_probs[], rans_lut cdf_tab);
-
-void aom_rans_merge_prob8_pdf(AnsP10 *const out_pdf, const AnsP8 node_prob,
- const AnsP10 *const src_pdf, int in_syms);
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+ const AnsP8 node_prob,
+ const aom_cdf_prob *const src_pdf, int in_syms);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index 11619b0..1f66531 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -62,24 +62,25 @@
struct rans_dec_sym {
uint8_t val;
- AnsP10 prob;
- AnsP10 cum_prob; // not-inclusive
+ aom_cdf_prob prob;
+ aom_cdf_prob cum_prob; // not-inclusive
};
-static INLINE void fetch_sym(struct rans_dec_sym *out, const rans_lut cdf,
- AnsP10 rem) {
- int i = 0;
+static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
+ aom_cdf_prob rem) {
+ int i;
+ aom_cdf_prob cum_prob = 0, top_prob;
// TODO(skal): if critical, could be a binary search.
// Or, better, an O(1) alias-table.
- while (rem >= cdf[i]) {
- ++i;
+ for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
+ cum_prob = top_prob;
}
- out->val = i - 1;
- out->prob = (AnsP10)(cdf[i] - cdf[i - 1]);
- out->cum_prob = (AnsP10)cdf[i - 1];
+ out->val = i;
+ out->prob = top_prob - cum_prob;
+ out->cum_prob = cum_prob;
}
-static INLINE int rans_read(struct AnsDecoder *ans, const rans_lut tab) {
+static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
unsigned rem;
unsigned quo;
struct rans_dec_sym sym;
diff --git a/aom_dsp/answriter.h b/aom_dsp/answriter.h
index 5a82d35..0ac1bda 100644
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -75,8 +75,8 @@
}
struct rans_sym {
- AnsP10 prob;
- AnsP10 cum_prob; // not-inclusive
+ aom_cdf_prob prob;
+ aom_cdf_prob cum_prob; // not-inclusive
};
// rANS with normalization
@@ -84,7 +84,7 @@
// RANS_PRECISION is m
static INLINE void rans_write(struct AnsCoder *ans,
const struct rans_sym *const sym) {
- const AnsP10 p = sym->prob;
+ const aom_cdf_prob p = sym->prob;
while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
ans->state /= IO_BASE;
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 036aef0..25f98a8 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -9,6 +9,7 @@
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
+
DSP_SRCS-yes += aom_dsp.mk
DSP_SRCS-yes += aom_dsp_common.h
@@ -72,8 +73,6 @@
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c
# inter predictions
-
-ifeq ($(CONFIG_AV1),yes)
DSP_SRCS-yes += blend.h
DSP_SRCS-yes += blend_a64_mask.c
DSP_SRCS-yes += blend_a64_hmask.c
@@ -82,7 +81,6 @@
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
-endif #CONFIG_AV1
# interpolation filters
DSP_SRCS-yes += aom_convolve.c
@@ -101,7 +99,6 @@
DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_bilinear_sse2.asm
endif
-
DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm
ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 8d17d03..6af5588 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -44,6 +44,27 @@
# Intra prediction
#
+add_proto qw/void aom_dc_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_predictor_2x2/;
+
+add_proto qw/void aom_dc_top_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_top_predictor_2x2/;
+
+add_proto qw/void aom_dc_left_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_left_predictor_2x2/;
+
+add_proto qw/void aom_dc_128_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_128_predictor_2x2/;
+
+add_proto qw/void aom_v_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_v_predictor_2x2/;
+
+add_proto qw/void aom_h_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_h_predictor_2x2/;
+
+add_proto qw/void aom_tm_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_tm_predictor_2x2/;
+
add_proto qw/void aom_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d207_predictor_4x4 sse2/;
@@ -648,59 +669,32 @@
#
# Forward transform
#
-if ((aom_config("CONFIG_AV1_ENCODER") eq "yes")) {
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4 sse2/;
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct4x4 sse2/;
- add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4_1 sse2/;
+ add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8 sse2/;
- add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8 sse2/;
+ add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8_1/;
- add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8_1 sse2/;
+ add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct16x16 sse2/;
- add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16 sse2/;
+ add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct16x16_1/;
- add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16_1 sse2/;
+ add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32 sse2/;
- add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32 sse2/;
+ add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32_rd sse2/;
- add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_rd sse2/;
-
- add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_1 sse2/;
-
- add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct4x4 sse2/;
-
- add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct8x8 sse2/;
-
- add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct8x8_1/;
-
- add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct16x16 sse2/;
-
- add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct16x16_1/;
-
- add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32 sse2/;
-
- add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
- add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32_1/;
-} else {
+ add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32_1/;
+ } # CONFIG_AOM_HIGHBITDEPTH
add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_fdct4x4 sse2 msa/;
@@ -726,8 +720,7 @@
specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_1 sse2 msa/;
-} # CONFIG_AOM_HIGHBITDEPTH
+ specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
} # CONFIG_AV1_ENCODER
#
diff --git a/aom_dsp/arm/loopfilter_4_neon.asm b/aom_dsp/arm/loopfilter_4_neon.asm
index e82dea5..8b54984 100644
--- a/aom_dsp/arm/loopfilter_4_neon.asm
+++ b/aom_dsp/arm/loopfilter_4_neon.asm
@@ -17,7 +17,7 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
-; Currently aom only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on 8 iterations at a time. The legacy loop filter
; works on 16 iterations at a time.
;
; void aom_lpf_horizontal_4_neon(uint8_t *s,
@@ -66,7 +66,7 @@
pop {pc}
ENDP ; |aom_lpf_horizontal_4_neon|
-; Currently aom only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on 8 iterations at a time. The legacy loop filter
; works on 16 iterations at a time.
;
; void aom_lpf_vertical_4_neon(uint8_t *s,
diff --git a/aom_dsp/arm/loopfilter_8_neon.asm b/aom_dsp/arm/loopfilter_8_neon.asm
index 23b819b..9f3db66 100644
--- a/aom_dsp/arm/loopfilter_8_neon.asm
+++ b/aom_dsp/arm/loopfilter_8_neon.asm
@@ -17,7 +17,7 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
-; Currently aom only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on 8 iterations at a time. The legacy loop filter
; works on 16 iterations at a time.
;
; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index d062e07..52e4dc8 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -104,6 +104,20 @@
return aom_read_tree_bits(r, tree, probs);
}
+static INLINE int aom_read_symbol(aom_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs) {
+#if CONFIG_ANS
+ (void)nsymbs;
+ return rans_read(r, cdf);
+#else
+ (void)r;
+ (void)cdf;
+ (void)nsymbs;
+ assert(0 && "Unsupported bitreader operation");
+ return -1;
+#endif
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 5e34fd6..d6937aa 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -86,6 +86,24 @@
aom_write_tree_bits(w, tree, probs, bits, len, i);
}
+static INLINE void aom_write_symbol(aom_writer *w, int symb,
+ const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_ANS
+ struct rans_sym s;
+ (void)nsymbs;
+ assert(cdf);
+ s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
+ s.prob = cdf[symb] - s.cum_prob;
+ buf_rans_write(w, &s);
+#else
+ (void)w;
+ (void)symb;
+ (void)cdf;
+ (void)nsymbs;
+ assert(0 && "Unsupported bitwriter operation");
+#endif
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 1e40e68..c3af1f4 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -837,6 +837,7 @@
/* clang-format off */
#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
@@ -846,7 +847,7 @@
intra_pred_highbd_sized(type, 16) \
intra_pred_highbd_sized(type, 32)
-#define intra_pred_no_4x4(type) \
+#define intra_pred_above_4x4(type) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
intra_pred_sized(type, 32) \
@@ -857,26 +858,27 @@
#else
#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
intra_pred_sized(type, 32)
-#define intra_pred_no_4x4(type) \
+#define intra_pred_above_4x4(type) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
intra_pred_sized(type, 32)
#endif // CONFIG_AOM_HIGHBITDEPTH
-intra_pred_no_4x4(d207)
-intra_pred_no_4x4(d63)
-intra_pred_no_4x4(d45)
+intra_pred_above_4x4(d207)
+intra_pred_above_4x4(d63)
+intra_pred_above_4x4(d45)
intra_pred_allsizes(d207e)
intra_pred_allsizes(d63e)
-intra_pred_no_4x4(d45e)
-intra_pred_no_4x4(d117)
-intra_pred_no_4x4(d135)
-intra_pred_no_4x4(d153)
+intra_pred_above_4x4(d45e)
+intra_pred_above_4x4(d117)
+intra_pred_above_4x4(d135)
+intra_pred_above_4x4(d153)
intra_pred_allsizes(v)
intra_pred_allsizes(h)
#if CONFIG_ALT_INTRA
diff --git a/aom_dsp/mips/sad_msa.c b/aom_dsp/mips/sad_msa.c
index 7f7364d..258eb5c 100644
--- a/aom_dsp/mips/sad_msa.c
+++ b/aom_dsp/mips/sad_msa.c
@@ -1435,93 +1435,95 @@
second_pred); \
}
+/* clang-format off */
// 64x64
-AOM_SAD_64xHEIGHT_MSA(64);
-AOM_SAD_64xHEIGHTx3_MSA(64);
-AOM_SAD_64xHEIGHTx8_MSA(64);
-AOM_SAD_64xHEIGHTx4D_MSA(64);
-AOM_AVGSAD_64xHEIGHT_MSA(64);
+AOM_SAD_64xHEIGHT_MSA(64)
+AOM_SAD_64xHEIGHTx3_MSA(64)
+AOM_SAD_64xHEIGHTx8_MSA(64)
+AOM_SAD_64xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_64xHEIGHT_MSA(64)
// 64x32
-AOM_SAD_64xHEIGHT_MSA(32);
-AOM_SAD_64xHEIGHTx3_MSA(32);
-AOM_SAD_64xHEIGHTx8_MSA(32);
-AOM_SAD_64xHEIGHTx4D_MSA(32);
-AOM_AVGSAD_64xHEIGHT_MSA(32);
+AOM_SAD_64xHEIGHT_MSA(32)
+AOM_SAD_64xHEIGHTx3_MSA(32)
+AOM_SAD_64xHEIGHTx8_MSA(32)
+AOM_SAD_64xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_64xHEIGHT_MSA(32)
// 32x64
-AOM_SAD_32xHEIGHT_MSA(64);
-AOM_SAD_32xHEIGHTx3_MSA(64);
-AOM_SAD_32xHEIGHTx8_MSA(64);
-AOM_SAD_32xHEIGHTx4D_MSA(64);
-AOM_AVGSAD_32xHEIGHT_MSA(64);
+AOM_SAD_32xHEIGHT_MSA(64)
+AOM_SAD_32xHEIGHTx3_MSA(64)
+AOM_SAD_32xHEIGHTx8_MSA(64)
+AOM_SAD_32xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_32xHEIGHT_MSA(64)
// 32x32
-AOM_SAD_32xHEIGHT_MSA(32);
-AOM_SAD_32xHEIGHTx3_MSA(32);
-AOM_SAD_32xHEIGHTx8_MSA(32);
-AOM_SAD_32xHEIGHTx4D_MSA(32);
-AOM_AVGSAD_32xHEIGHT_MSA(32);
+AOM_SAD_32xHEIGHT_MSA(32)
+AOM_SAD_32xHEIGHTx3_MSA(32)
+AOM_SAD_32xHEIGHTx8_MSA(32)
+AOM_SAD_32xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_32xHEIGHT_MSA(32)
// 32x16
-AOM_SAD_32xHEIGHT_MSA(16);
-AOM_SAD_32xHEIGHTx3_MSA(16);
-AOM_SAD_32xHEIGHTx8_MSA(16);
-AOM_SAD_32xHEIGHTx4D_MSA(16);
-AOM_AVGSAD_32xHEIGHT_MSA(16);
+AOM_SAD_32xHEIGHT_MSA(16)
+AOM_SAD_32xHEIGHTx3_MSA(16)
+AOM_SAD_32xHEIGHTx8_MSA(16)
+AOM_SAD_32xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_32xHEIGHT_MSA(16)
// 16x32
-AOM_SAD_16xHEIGHT_MSA(32);
-AOM_SAD_16xHEIGHTx3_MSA(32);
-AOM_SAD_16xHEIGHTx8_MSA(32);
-AOM_SAD_16xHEIGHTx4D_MSA(32);
-AOM_AVGSAD_16xHEIGHT_MSA(32);
+AOM_SAD_16xHEIGHT_MSA(32)
+AOM_SAD_16xHEIGHTx3_MSA(32)
+AOM_SAD_16xHEIGHTx8_MSA(32)
+AOM_SAD_16xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_16xHEIGHT_MSA(32)
// 16x16
-AOM_SAD_16xHEIGHT_MSA(16);
-AOM_SAD_16xHEIGHTx3_MSA(16);
-AOM_SAD_16xHEIGHTx8_MSA(16);
-AOM_SAD_16xHEIGHTx4D_MSA(16);
-AOM_AVGSAD_16xHEIGHT_MSA(16);
+AOM_SAD_16xHEIGHT_MSA(16)
+AOM_SAD_16xHEIGHTx3_MSA(16)
+AOM_SAD_16xHEIGHTx8_MSA(16)
+AOM_SAD_16xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_16xHEIGHT_MSA(16)
// 16x8
-AOM_SAD_16xHEIGHT_MSA(8);
-AOM_SAD_16xHEIGHTx3_MSA(8);
-AOM_SAD_16xHEIGHTx8_MSA(8);
-AOM_SAD_16xHEIGHTx4D_MSA(8);
-AOM_AVGSAD_16xHEIGHT_MSA(8);
+AOM_SAD_16xHEIGHT_MSA(8)
+AOM_SAD_16xHEIGHTx3_MSA(8)
+AOM_SAD_16xHEIGHTx8_MSA(8)
+AOM_SAD_16xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_16xHEIGHT_MSA(8)
// 8x16
-AOM_SAD_8xHEIGHT_MSA(16);
-AOM_SAD_8xHEIGHTx3_MSA(16);
-AOM_SAD_8xHEIGHTx8_MSA(16);
-AOM_SAD_8xHEIGHTx4D_MSA(16);
-AOM_AVGSAD_8xHEIGHT_MSA(16);
+AOM_SAD_8xHEIGHT_MSA(16)
+AOM_SAD_8xHEIGHTx3_MSA(16)
+AOM_SAD_8xHEIGHTx8_MSA(16)
+AOM_SAD_8xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_8xHEIGHT_MSA(16)
// 8x8
-AOM_SAD_8xHEIGHT_MSA(8);
-AOM_SAD_8xHEIGHTx3_MSA(8);
-AOM_SAD_8xHEIGHTx8_MSA(8);
-AOM_SAD_8xHEIGHTx4D_MSA(8);
-AOM_AVGSAD_8xHEIGHT_MSA(8);
+AOM_SAD_8xHEIGHT_MSA(8)
+AOM_SAD_8xHEIGHTx3_MSA(8)
+AOM_SAD_8xHEIGHTx8_MSA(8)
+AOM_SAD_8xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_8xHEIGHT_MSA(8)
// 8x4
-AOM_SAD_8xHEIGHT_MSA(4);
-AOM_SAD_8xHEIGHTx3_MSA(4);
-AOM_SAD_8xHEIGHTx8_MSA(4);
-AOM_SAD_8xHEIGHTx4D_MSA(4);
-AOM_AVGSAD_8xHEIGHT_MSA(4);
+AOM_SAD_8xHEIGHT_MSA(4)
+AOM_SAD_8xHEIGHTx3_MSA(4)
+AOM_SAD_8xHEIGHTx8_MSA(4)
+AOM_SAD_8xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_8xHEIGHT_MSA(4)
// 4x8
-AOM_SAD_4xHEIGHT_MSA(8);
-AOM_SAD_4xHEIGHTx3_MSA(8);
-AOM_SAD_4xHEIGHTx8_MSA(8);
-AOM_SAD_4xHEIGHTx4D_MSA(8);
-AOM_AVGSAD_4xHEIGHT_MSA(8);
+AOM_SAD_4xHEIGHT_MSA(8)
+AOM_SAD_4xHEIGHTx3_MSA(8)
+AOM_SAD_4xHEIGHTx8_MSA(8)
+AOM_SAD_4xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_4xHEIGHT_MSA(8)
// 4x4
-AOM_SAD_4xHEIGHT_MSA(4);
-AOM_SAD_4xHEIGHTx3_MSA(4);
-AOM_SAD_4xHEIGHTx8_MSA(4);
-AOM_SAD_4xHEIGHTx4D_MSA(4);
-AOM_AVGSAD_4xHEIGHT_MSA(4);
+AOM_SAD_4xHEIGHT_MSA(4)
+AOM_SAD_4xHEIGHTx3_MSA(4)
+AOM_SAD_4xHEIGHTx8_MSA(4)
+AOM_SAD_4xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_4xHEIGHT_MSA(4)
+/* clang-format on */
diff --git a/aom_dsp/mips/sub_pixel_variance_msa.c b/aom_dsp/mips/sub_pixel_variance_msa.c
index cfbdb15..3eb8510 100644
--- a/aom_dsp/mips/sub_pixel_variance_msa.c
+++ b/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -1,11 +1,12 @@
/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_dsp_rtcd.h"
@@ -1652,23 +1653,25 @@
return var; \
}
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+/* clang-format off */
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
+/* clang-format on */
#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \
@@ -1703,19 +1706,21 @@
return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
}
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
int32_t src_stride,
@@ -1784,5 +1789,7 @@
return VARIANCE_64Wx##ht##H(*sse, diff); \
}
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
+/* clang-format on */
diff --git a/aom_dsp/mips/variance_msa.c b/aom_dsp/mips/variance_msa.c
index 1479363..745fdfc 100644
--- a/aom_dsp/mips/variance_msa.c
+++ b/aom_dsp/mips/variance_msa.c
@@ -540,8 +540,9 @@
return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
}
-AOM_VARIANCE_WDXHT_MSA(4, 4);
-AOM_VARIANCE_WDXHT_MSA(4, 8);
+/* clang-format off */
+AOM_VARIANCE_WDXHT_MSA(4, 4)
+AOM_VARIANCE_WDXHT_MSA(4, 8)
AOM_VARIANCE_WDXHT_MSA(8, 4)
AOM_VARIANCE_WDXHT_MSA(8, 8)
@@ -553,6 +554,7 @@
AOM_VARIANCE_WDXHT_MSA(32, 16)
AOM_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride,
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index 4f25b30..cd133e2 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -23,6 +23,9 @@
typedef uint8_t aom_prob;
+// TODO(negge): Rename this aom_prob once we remove vpxbool.
+typedef uint16_t aom_cdf_prob;
+
#define MAX_PROB 255
#define aom_prob_half ((aom_prob)128)
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index 4504996..8319f03 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -162,7 +162,11 @@
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
#if defined(__SSSE3__)
+#ifdef __x86_64__
v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
+#endif
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
_mm_shuffle_epi8(a, order));
#else
@@ -176,7 +180,11 @@
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
#if defined(__SSSE3__)
+#ifdef __x86_64__
v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
+#endif
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
_mm_shuffle_epi8(a, order));
#else
@@ -262,7 +270,7 @@
SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
v128 r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && defined(__x86_64__)
v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
_mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index b951492..bef43c4 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -47,7 +47,11 @@
}
SIMD_INLINE v64 v64_from_64(uint64_t x) {
+#ifdef __x86_64__
+ return _mm_cvtsi64_si128(x);
+#else
return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
+#endif
}
SIMD_INLINE uint64_t v64_u64(v64 x) {
@@ -168,7 +172,7 @@
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- _mm_cvtsi64_si128(0x0f0d0b0907050301LL));
+ v64_from_64(0x0f0d0b0907050301LL));
#else
return _mm_packus_epi16(
_mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
@@ -179,7 +183,7 @@
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- _mm_cvtsi64_si128(0x0e0c0a0806040200LL));
+ v64_from_64(0x0e0c0a0806040200LL));
#else
return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
@@ -188,7 +192,7 @@
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- _mm_cvtsi64_si128(0x0f0e0b0a07060302LL));
+ v64_from_64(0x0f0e0b0a07060302LL));
#else
return _mm_packs_epi32(
_mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
@@ -199,7 +203,7 @@
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- _mm_cvtsi64_si128(0x0d0c090805040100LL));
+ v64_from_64(0x0d0c090805040100LL));
#else
return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
@@ -261,7 +265,7 @@
SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
__m128i r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && defined(__x86_64__)
__m128i x = _mm_cvtepi32_epi64(r);
return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
diff --git a/args.c b/args.c
index e12f16b..5829857 100644
--- a/args.c
+++ b/args.c
@@ -14,6 +14,7 @@
#include <limits.h>
#include "args.h"
+#include "aom/aom_integer.h"
#include "aom_ports/msvc.h"
#if defined(__GNUC__) && __GNUC__
@@ -119,13 +120,13 @@
}
unsigned int arg_parse_uint(const struct arg *arg) {
- long int rawval;
+ uint32_t rawval;
char *endptr;
- rawval = strtol(arg->val, &endptr, 10);
+ rawval = strtoul(arg->val, &endptr, 10);
if (arg->val[0] != '\0' && endptr[0] == '\0') {
- if (rawval >= 0 && rawval <= UINT_MAX) return rawval;
+ if (rawval <= UINT_MAX) return rawval;
die("Option %s: Value %ld out of range for unsigned int\n", arg->name,
rawval);
@@ -136,7 +137,7 @@
}
int arg_parse_int(const struct arg *arg) {
- long int rawval;
+ int32_t rawval;
char *endptr;
rawval = strtol(arg->val, &endptr, 10);
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 5a283a9..9730bee 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -98,6 +98,8 @@
ifeq ($(CONFIG_DERING),yes)
AV1_COMMON_SRCS-yes += common/od_dering.c
AV1_COMMON_SRCS-yes += common/od_dering.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
AV1_COMMON_SRCS-yes += common/dering.c
AV1_COMMON_SRCS-yes += common/dering.h
endif
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index be23948..55aee8c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -20,6 +20,7 @@
struct mv;
union int_mv;
struct yv12_buffer_config;
+typedef int16_t od_dering_in;
EOF
}
forward_decls qw/av1_common_forward_decls/;
@@ -390,9 +391,6 @@
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x16 sse2 avx2/;
-add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-specialize qw/av1_fht32x32/;
-
if (aom_config("CONFIG_EXT_TX") eq "yes") {
add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht4x8 sse2/;
@@ -411,6 +409,9 @@
add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x16/;
+
+ add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht32x32 avx2/;
}
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
@@ -840,4 +841,24 @@
}
# end encoder functions
+
+# Deringing Functions
+
+if (aom_config("CONFIG_DERING") eq "yes") {
+ add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
+ specialize qw/od_dir_find8 sse4_1/;
+
+ add_proto qw/int od_filter_dering_direction_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+ specialize qw/od_filter_dering_direction_4x4 sse4_1/;
+
+ add_proto qw/int od_filter_dering_direction_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+ specialize qw/od_filter_dering_direction_8x8 sse4_1/;
+
+ add_proto qw/void od_filter_dering_orthogonal_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+ specialize qw/od_filter_dering_orthogonal_4x4 sse4_1/;
+
+ add_proto qw/void od_filter_dering_orthogonal_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+ specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
+}
+
1;
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 1cf5272..a01e6b4 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -14,14 +14,6 @@
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
-int av1_clpf_maxbits(const AV1_COMMON *cm) {
- return get_msb(
- ALIGN_POWER_OF_TWO(cm->mi_cols * MI_SIZE, cm->clpf_size + 4) *
- ALIGN_POWER_OF_TWO(cm->mi_rows * MI_SIZE, cm->clpf_size + 4) >>
- (cm->clpf_size * 2 + 8)) +
- 1;
-}
-
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) +
3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) +
@@ -73,14 +65,14 @@
#endif
// Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
- const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
- int enable_fb_flag, unsigned int strength,
- unsigned int fb_size_log2, uint8_t *blocks, int plane,
- int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
- const YV12_BUFFER_CONFIG *,
- const AV1_COMMON *cm, int, int, int,
- unsigned int, unsigned int, uint8_t *)) {
+void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+ int enable_fb_flag, unsigned int strength,
+ unsigned int fb_size_log2, int plane,
+ int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+ const YV12_BUFFER_CONFIG *,
+ const AV1_COMMON *cm, int, int, int,
+ unsigned int, unsigned int, int8_t *)) {
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
@@ -95,7 +87,6 @@
int dstride = bs;
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
- int block_index = 0;
uint8_t *cache = NULL;
uint8_t **cache_ptr = NULL;
uint8_t **cache_dst = NULL;
@@ -125,7 +116,7 @@
for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) {
int h, w;
- int allskip = 1;
+ int allskip = !(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2);
const int xoff = l << fb_size_log2;
const int yoff = k << fb_size_log2;
for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
@@ -148,8 +139,11 @@
w += !w << fb_size_log2;
if (!allskip && // Do not filter the block if all is skip encoded
(!enable_fb_flag ||
+ // Only called if fb_flag enabled (luma only)
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
- fb_size_log2, blocks + block_index))) {
+ fb_size_log2,
+ cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
+ xoff / MIN_FB_SIZE))) {
// Iterate over all smaller blocks inside the filter block
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
@@ -160,8 +154,9 @@
sizey = AOMMIN(height - ypos, bs);
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
- ->mbmi.skip) { // Not skip block
- // Temporary buffering needed if filtering in-place
+ ->mbmi.skip ||
+ (enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2)) {
+ // Temporary buffering needed for in-place filtering
if (cache_ptr[cache_idx]) {
// Copy filtered block back into the frame
#if CONFIG_AOM_HIGHBITDEPTH
@@ -247,7 +242,6 @@
}
}
}
- block_index += !allskip; // Count number of blocks filtered
}
}
@@ -287,6 +281,4 @@
aom_free(cache);
aom_free(cache_ptr);
aom_free(cache_dst);
-
- return block_index;
}
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 8e4213b..fc74f2c 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -13,17 +13,19 @@
#include "av1/common/reconinter.h"
-#define MAX_FB_SIZE 128
+#define MAX_FB_SIZE_LOG2 7
+#define MIN_FB_SIZE_LOG2 5
+#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2)
+#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
-int av1_clpf_maxbits(const AV1_COMMON *cm);
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
- const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
- int enable_fb_flag, unsigned int strength,
- unsigned int fb_size_log2, uint8_t *blocks, int plane,
- int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
- const YV12_BUFFER_CONFIG *,
- const AV1_COMMON *cm, int, int, int,
- unsigned int, unsigned int, uint8_t *));
+void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+ int enable_fb_flag, unsigned int strength,
+ unsigned int fb_size_log2, int plane,
+ int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+ const YV12_BUFFER_CONFIG *,
+ const AV1_COMMON *cm, int, int, int,
+ unsigned int, unsigned int, int8_t *));
#endif
diff --git a/av1/common/dering.c b/av1/common/dering.c
index 7405fb5..c21d4e5 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -98,30 +98,28 @@
int nhb, nvb;
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+ level = compute_level_from_index(
+ global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+ MAX_MIB_SIZE * sbc]
+ ->mbmi.dering_gain);
+ if (level == 0 || sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE))
+ continue;
for (pli = 0; pli < 3; pli++) {
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
int threshold;
-#if DERING_REFINEMENT
- level = compute_level_from_index(
- global_level,
- cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
- MAX_MIB_SIZE * sbc]
- ->mbmi.dering_gain);
-#else
- level = global_level;
-#endif
/* FIXME: This is a temporary hack that uses more conservative
deringing for chroma. */
- if (pli) level = (level * 5 + 4) >> 3;
- if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) level = 0;
- threshold = level << coeff_shift;
- od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
+ if (pli)
+ threshold = (level * 5 + 4) >> 3 << coeff_shift;
+ else
+ threshold = level << coeff_shift;
+ if (threshold == 0) continue;
+ od_dering(dst, MAX_MIB_SIZE * bsize[pli],
&src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
sbc * bsize[pli] * MAX_MIB_SIZE],
stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
&bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
- cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP,
- coeff_shift);
+ cm->mi_cols, threshold, coeff_shift);
for (r = 0; r < bsize[pli] * nvb; ++r) {
for (c = 0; c < bsize[pli] * nhb; ++c) {
#if CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/dering.h b/av1/common/dering.h
index 2c1efd7..7c93f8b 100644
--- a/av1/common/dering.h
+++ b/av1/common/dering.h
@@ -24,7 +24,6 @@
#define DERING_LEVEL_BITS 6
#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
-#define DERING_REFINEMENT 1
#define DERING_REFINEMENT_BITS 2
#define DERING_REFINEMENT_LEVELS 4
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index c9166db..1defc53 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -418,263 +418,264 @@
// beta = 8
// Values for tokens ONE_TOKEN through CATEGORY6_TOKEN included here.
// ZERO_TOKEN and EOB_TOKEN are coded as flags outside this coder.
-const AnsP10 av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
- { 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
- { 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
- { 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
- { 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
- { 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
- { 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
- { 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
- { 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
- { 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
- { 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
- { 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
- { 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
- { 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
- { 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
- { 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
- { 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
- { 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
- { 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
- { 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
- { 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
- { 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
- { 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
- { 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
- { 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
- { 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
- { 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
- { 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
- { 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
- { 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
- { 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
- { 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
- { 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
- { 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
- { 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
- { 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
- { 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
- { 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
- { 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
- { 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
- { 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
- { 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
- { 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
- { 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
- { 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
- { 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
- { 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
- { 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
- { 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
- { 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
- { 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
- { 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
- { 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
- { 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
- { 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
- { 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
- { 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
- { 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
- { 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
- { 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
- { 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
- { 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
- { 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
- { 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
- { 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
- { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
- { 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
- { 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
- { 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
- { 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
- { 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
- { 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
- { 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
- { 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
- { 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
- { 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
- { 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
- { 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
- { 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
- { 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
- { 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
- { 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
- { 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
- { 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
- { 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
- { 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
- { 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
- { 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
- { 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
- { 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
- { 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
- { 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
- { 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
- { 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
- { 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
- { 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
- { 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
- { 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
- { 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
- { 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
- { 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
- { 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
- { 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
- { 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
- { 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
- { 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
- { 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
- { 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
- { 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
- { 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
- { 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
- { 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
- { 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
- { 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
- { 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
- { 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
- { 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
- { 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
- { 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
- { 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
- { 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
- { 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
- { 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
- { 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
- { 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
- { 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
- { 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
- { 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
- { 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
- { 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
- { 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
- { 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
- { 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
- { 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
- { 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
- { 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
- { 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
- { 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
- { 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
- { 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
- { 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
- { 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
- { 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
- { 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
- { 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
- { 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
- { 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
- { 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
- { 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
- { 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
- { 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
- { 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
- { 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
- { 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
- { 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
- { 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
- { 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
- { 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
- { 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
- { 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
- { 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
- { 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
- { 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
- { 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
- { 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
- { 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
- { 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
- { 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
- { 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
- { 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
- { 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
- { 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
- { 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
- { 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
- { 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
- { 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
- { 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
- { 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
- { 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
- { 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
- { 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
- { 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
- { 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
- { 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
- { 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
- { 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
- { 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
- { 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
- { 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
- { 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
- { 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
- { 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
- { 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
- { 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
- { 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
- { 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
- { 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
- { 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
- { 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
- { 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
- { 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
- { 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
- { 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
- { 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
- { 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
- { 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
- { 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
- { 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
- { 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
- { 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
- { 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
- { 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
- { 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
- { 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
- { 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
- { 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
- { 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
- { 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
- { 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
- { 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
- { 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
- { 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
- { 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
- { 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
- { 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
- { 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
- { 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
- { 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
- { 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
- { 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
- { 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
- { 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
- { 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
- { 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
- { 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
- { 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
- { 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
- { 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
- { 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
- { 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
- { 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
- { 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
- { 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
- { 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
- { 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
- { 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
- { 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
- { 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
- { 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
- { 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
- { 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
- { 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
- { 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
- { 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
- { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
- { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-};
+const aom_cdf_prob
+ av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
+ { 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
+ { 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
+ { 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
+ { 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
+ { 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
+ { 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
+ { 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
+ { 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
+ { 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
+ { 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
+ { 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
+ { 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
+ { 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
+ { 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
+ { 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
+ { 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
+ { 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
+ { 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
+ { 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
+ { 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
+ { 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
+ { 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
+ { 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
+ { 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
+ { 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
+ { 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
+ { 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
+ { 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
+ { 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
+ { 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
+ { 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
+ { 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
+ { 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
+ { 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
+ { 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
+ { 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
+ { 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
+ { 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
+ { 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
+ { 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
+ { 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
+ { 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
+ { 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
+ { 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
+ { 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
+ { 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
+ { 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
+ { 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
+ { 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
+ { 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
+ { 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
+ { 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
+ { 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
+ { 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
+ { 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
+ { 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
+ { 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
+ { 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
+ { 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
+ { 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
+ { 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
+ { 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
+ { 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
+ { 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
+ { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
+ { 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
+ { 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
+ { 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
+ { 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
+ { 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
+ { 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
+ { 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
+ { 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
+ { 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
+ { 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
+ { 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
+ { 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
+ { 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
+ { 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
+ { 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
+ { 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
+ { 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
+ { 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
+ { 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
+ { 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
+ { 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
+ { 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
+ { 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
+ { 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
+ { 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
+ { 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
+ { 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
+ { 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
+ { 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
+ { 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
+ { 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
+ { 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
+ { 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
+ { 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
+ { 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
+ { 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
+ { 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
+ { 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
+ { 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
+ { 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
+ { 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
+ { 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
+ { 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
+ { 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
+ { 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
+ { 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
+ { 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
+ { 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
+ { 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
+ { 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
+ { 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
+ { 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
+ { 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
+ { 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
+ { 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
+ { 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
+ { 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
+ { 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
+ { 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
+ { 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
+ { 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
+ { 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
+ { 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
+ { 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
+ { 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
+ { 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
+ { 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
+ { 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
+ { 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
+ { 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
+ { 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
+ { 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
+ { 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
+ { 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
+ { 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
+ { 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
+ { 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
+ { 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
+ { 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
+ { 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
+ { 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
+ { 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
+ { 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
+ { 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
+ { 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
+ { 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
+ { 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
+ { 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
+ { 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
+ { 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
+ { 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
+ { 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
+ { 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
+ { 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
+ { 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
+ { 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
+ { 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
+ { 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
+ { 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
+ { 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
+ { 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
+ { 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
+ { 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
+ { 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
+ { 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
+ { 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
+ { 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
+ { 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
+ { 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
+ { 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
+ { 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
+ { 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
+ { 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
+ { 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
+ { 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
+ { 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
+ { 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
+ { 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
+ { 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
+ { 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
+ { 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
+ { 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
+ { 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
+ { 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
+ { 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
+ { 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
+ { 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
+ { 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
+ { 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
+ { 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
+ { 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
+ { 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
+ { 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
+ { 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
+ { 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
+ { 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
+ { 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
+ { 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
+ { 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
+ { 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
+ { 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
+ { 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
+ { 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
+ { 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
+ { 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
+ { 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
+ { 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
+ { 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
+ { 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
+ { 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
+ { 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
+ { 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
+ { 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
+ { 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
+ { 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
+ { 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
+ { 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
+ { 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
+ { 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
+ { 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
+ { 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
+ { 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
+ { 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
+ { 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
+ { 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
+ { 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
+ { 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
+ { 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
+ { 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
+ { 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
+ { 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
+ { 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
+ { 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
+ { 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
+ { 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
+ { 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
+ { 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
+ { 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
+ { 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
+ { 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
+ { 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
+ { 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
+ { 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
+ { 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
+ { 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ };
#endif // CONFIG_ANS
/* clang-format off */
@@ -2801,15 +2802,13 @@
}
#if CONFIG_ANS
-void av1_build_token_cdfs(const aom_prob *pdf_model, rans_lut cdf) {
- AnsP10 pdf_tab[ENTROPY_TOKENS - 1];
+static void build_token_cdfs(const aom_prob *pdf_model,
+ aom_cdf_prob cdf[ENTROPY_TOKENS]) {
+ int i, sum = 0;
assert(pdf_model[2] != 0);
- // TODO(aconverse): Investigate making the precision of the zero and EOB tree
- // nodes 10-bits.
- aom_rans_merge_prob8_pdf(pdf_tab, pdf_model[1],
- av1_pareto8_token_probs[pdf_model[2] - 1],
- ENTROPY_TOKENS - 2);
- aom_rans_build_cdf_from_pdf(pdf_tab, cdf);
+ for (i = 0; i < ENTROPY_TOKENS - 2; ++i) {
+ cdf[i] = sum += av1_pareto8_token_probs[pdf_model[2] - 1][i];
+ }
}
void av1_coef_pareto_cdfs(FRAME_CONTEXT *fc) {
@@ -2819,9 +2818,10 @@
for (i = 0; i < PLANE_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
- for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
- av1_build_token_cdfs(fc->coef_probs[t][i][j][k][l],
- fc->coef_cdfs[t][i][j][k][l]);
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ build_token_cdfs(fc->coef_probs[t][i][j][k][l],
+ fc->coef_cdfs[t][i][j][k][l]);
+ }
}
#endif // CONFIG_ANS
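For orientation, each row of av1_pareto8_token_probs sums to RANS_PRECISION (1024); the first row shown above, for instance, gives 372 + 226 + 141 + 91 + 99 + 67 + 23 + 3 + 1 + 1 = 1024. The running sum in the new build_token_cdfs() therefore leaves the last CDF entry at exactly 1024. A minimal standalone sketch of the same construction (hypothetical names, not the library API):

#include <assert.h>
#include <stdint.h>

#define RANS_PRECISION 1024u

/* Turn a pdf row that sums to RANS_PRECISION into a cumulative table,
   mirroring the running-sum form of build_token_cdfs() above. */
static void pdf_to_cdf(const uint16_t *pdf, uint16_t *cdf, int nsyms) {
  unsigned sum = 0;
  int i;
  for (i = 0; i < nsyms; ++i) cdf[i] = (uint16_t)(sum += pdf[i]);
  assert(cdf[nsyms - 1] == RANS_PRECISION);
}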
diff --git a/av1/common/entropy.h b/av1/common/entropy.h
index f0727c0..fd68e82 100644
--- a/av1/common/entropy.h
+++ b/av1/common/entropy.h
@@ -191,10 +191,10 @@
extern const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
extern const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
#if CONFIG_ANS
-extern const AnsP10 av1_pareto8_token_probs[COEFF_PROB_MODELS]
- [ENTROPY_TOKENS - 2];
-
-typedef rans_lut coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS];
+typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+ [ENTROPY_TOKENS];
+extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
+ [ENTROPY_TOKENS - 2];
#endif // CONFIG_ANS
typedef aom_prob av1_coeff_probs_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 89a219d..cefed6c 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -246,7 +246,17 @@
PALETTE_COLORS
} PALETTE_COLOR;
+#ifdef CONFIG_CLPF
+#define CLPF_NOFLAG -1
+typedef enum {
+ CLPF_NOSIZE = 0,
+ CLPF_32X32 = 1,
+ CLPF_64X64 = 2,
+ CLPF_128X128 = 3
+} CLPF_BLOCK_SIZE;
+#endif
typedef enum ATTRIBUTE_PACKED {
+
DC_PRED, // Average of above and left pixels
V_PRED, // Vertical
H_PRED, // Horizontal
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 328f360..eedbc79 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -33,6 +33,9 @@
return txsize_sqr_up_map[tx_size] == TX_32X32;
}
+// NOTE: The implementations of all inverses need to be aware of the fact
+// that input and output could be the same buffer.
+
#if CONFIG_EXT_TX
static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
int i;
@@ -56,17 +59,17 @@
for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
}
-// For use in lieu of DST
+// For use in lieu of ADST
static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
int i;
tran_low_t inputhalf[16];
- for (i = 0; i < 16; ++i) {
- output[i] = input[16 + i] * 4;
- }
// Multiply input by sqrt(2)
for (i = 0; i < 16; ++i) {
inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
}
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
idct16_c(inputhalf, output + 16);
// Note overall scaling factor is 4 times orthogonal
}
@@ -106,14 +109,14 @@
int bd) {
int i;
tran_low_t inputhalf[16];
- for (i = 0; i < 16; ++i) {
- output[i] = input[16 + i] * 4;
- }
// Multiply input by sqrt(2)
for (i = 0; i < 16; ++i) {
inputhalf[i] =
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
}
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
aom_highbd_idct16_c(inputhalf, output + 16, bd);
// Note overall scaling factor is 4 times orthogonal
}
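Both reorderings above matter because, per the new NOTE, input and output may be the same buffer: writing output[0..15] = input[16..31] * 4 first would clobber input[0..15] before they are scaled into inputhalf. With the fix, the low half is read first, and the later writes to output[0..15] only read input[16..31], which those writes never touch. A toy illustration of the hazard (hypothetical code, int32_t standing in for tran_low_t):

#include <stdio.h>
#include <stdint.h>

/* In-place hazard in miniature: out may alias in. */
static void half_swap_bad(const int32_t *in, int32_t *out) {
  int i;
  for (i = 0; i < 2; ++i) out[i] = in[2 + i];  /* clobbers in[0..1] if aliased */
  for (i = 0; i < 2; ++i) out[2 + i] = in[i];  /* reads the clobbered values */
}

static void half_swap_good(const int32_t *in, int32_t *out) {
  int32_t lo[2];
  int i;
  for (i = 0; i < 2; ++i) lo[i] = in[i];       /* consume the low half first */
  for (i = 0; i < 2; ++i) out[i] = in[2 + i];
  for (i = 0; i < 2; ++i) out[2 + i] = lo[i];
}

int main(void) {
  int32_t a[4] = { 1, 2, 3, 4 }, b[4] = { 1, 2, 3, 4 };
  half_swap_bad(a, a);   /* a becomes {3, 4, 3, 4}: wrong */
  half_swap_good(b, b);  /* b becomes {3, 4, 1, 2}: the intended swap */
  printf("%d %d %d %d / %d %d %d %d\n", a[0], a[1], a[2], a[3], b[0], b[1],
         b[2], b[3]);
  return 0;
}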
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index fa59266..7aa704f 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -12,14 +12,12 @@
#include "config.h"
#endif
+// clang-format off
+
#include <stdlib.h>
#include <math.h>
#include "dering.h"
-
-const od_dering_opt_vtbl OD_DERING_VTBL_C = {
- { od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c },
- { od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c }
-};
+#include "./av1_rtcd.h"
/* Generated from gen_filter_tables.c. */
const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
@@ -35,9 +33,6 @@
{ 1 * OD_FILT_BSTRIDE + 0, 2 * OD_FILT_BSTRIDE - 1, 3 * OD_FILT_BSTRIDE - 1 },
};
-const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = { 0, 0.5, 0.707,
- 1, 1.41, 2 };
-
/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
The search minimizes the weighted variance along all the lines in a
particular direction, i.e. the squared error between the input and a
@@ -45,8 +40,8 @@
in a particular direction. Since each direction have the same sum(x^2) term,
that term is never computed. See Section 2, step 2, of:
http://jmvalin.ca/notes/intra_paint.pdf */
-static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
- int coeff_shift) {
+int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
+ int coeff_shift) {
int i;
int32_t cost[8] = { 0 };
int partial[8][15] = { { 0 } };
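For reference, the cost this search maximizes is, per direction, the sum over each pixel line of (line sum)^2 / (line length); the sum(x^2) term is identical for every direction and is dropped, so maximizing this cost minimizes the variance along the direction, as the comment above says. The SSE4 version added below keeps the computation in integers by multiplying by 840 / N instead of dividing by N, 840 being lcm(1..8), which is where its constants 840, 420, 280, 210, 168, 140, 120 and 105 come from. A scalar sketch of the per-direction cost (hypothetical helper):

#include <stdint.h>

/* Cost of one candidate direction: sum over its pixel lines of
   (sum of pixels in the line)^2 * (840 / line length).  The direction
   with the largest cost has the smallest variance along its lines. */
static int32_t direction_cost(const int32_t *line_sums,
                              const int *line_lengths, int nlines) {
  int32_t cost = 0;
  int i;
  for (i = 0; i < nlines; ++i)
    cost += line_sums[i] * line_sums[i] * (840 / line_lengths[i]);
  return cost;
}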
@@ -121,14 +116,15 @@
((OD_BSIZE_MAX + 2 * OD_FILT_BORDER) * (OD_BSIZE_MAX + 2 * OD_FILT_BORDER))
/* Smooth in the direction detected. */
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
- int ln, int threshold, int dir) {
+int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
+ int threshold, int dir) {
int i;
int j;
int k;
static const int taps[3] = { 3, 2, 1 };
- for (i = 0; i < 1 << ln; i++) {
- for (j = 0; j < 1 << ln; j++) {
+ int total_abs = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
int16_t sum;
int16_t xx;
int16_t yy;
@@ -144,28 +140,53 @@
if (abs(p0) < threshold) sum += taps[k] * p0;
if (abs(p1) < threshold) sum += taps[k] * p1;
}
- yy = xx + ((sum + 8) >> 4);
+ sum = (sum + 8) >> 4;
+ total_abs += abs(sum);
+ yy = xx + sum;
y[i * ystride + j] = yy;
}
}
+ return (total_abs + 8) >> 4;
}
-void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
- const int16_t *in, int threshold,
- int dir) {
- od_filter_dering_direction_c(y, ystride, in, 2, threshold, dir);
-}
-
-void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
- const int16_t *in, int threshold,
- int dir) {
- od_filter_dering_direction_c(y, ystride, in, 3, threshold, dir);
+/* Smooth in the direction detected. */
+int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
+ int threshold, int dir) {
+ int i;
+ int j;
+ int k;
+ static const int taps[2] = { 4, 1 };
+ int total_abs = 0;
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ int16_t sum;
+ int16_t xx;
+ int16_t yy;
+ xx = in[i * OD_FILT_BSTRIDE + j];
+ sum = 0;
+ for (k = 0; k < 2; k++) {
+ int16_t p0;
+ int16_t p1;
+ p0 = in[i * OD_FILT_BSTRIDE + j + OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+ xx;
+ p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+ xx;
+ if (abs(p0) < threshold) sum += taps[k] * p0;
+ if (abs(p1) < threshold) sum += taps[k] * p1;
+ }
+ sum = (sum + 8) >> 4;
+ total_abs += abs(sum);
+ yy = xx + sum;
+ y[i * ystride + j] = yy;
+ }
+ }
+ return (total_abs + 2) >> 2;
}
/* Smooth in the direction orthogonal to what was detected. */
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
- const od_dering_in *x, int xstride, int ln,
- int threshold, int dir) {
+void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
+ const int16_t *in, int threshold,
+ int dir) {
int i;
int j;
int offset;
@@ -173,48 +194,51 @@
offset = OD_FILT_BSTRIDE;
else
offset = 1;
- for (i = 0; i < 1 << ln; i++) {
- for (j = 0; j < 1 << ln; j++) {
- int16_t athresh;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
int16_t yy;
int16_t sum;
int16_t p;
- /* Deringing orthogonal to the direction uses a tighter threshold
- because we want to be conservative. We've presumably already
- achieved some deringing, so the amount of change is expected
- to be low. Also, since we might be filtering across an edge, we
- want to make sure not to blur it. That being said, we might want
- to be a little bit more aggressive on pure horizontal/vertical
- since the ringing there tends to be directional, so it doesn't
- get removed by the directional filtering. */
- athresh = OD_MINI(
- threshold, threshold / 3 +
- abs(in[i * OD_FILT_BSTRIDE + j] - x[i * xstride + j]));
yy = in[i * OD_FILT_BSTRIDE + j];
sum = 0;
p = in[i * OD_FILT_BSTRIDE + j + offset] - yy;
- if (abs(p) < athresh) sum += p;
+ if (abs(p) < threshold) sum += p;
p = in[i * OD_FILT_BSTRIDE + j - offset] - yy;
- if (abs(p) < athresh) sum += p;
+ if (abs(p) < threshold) sum += p;
p = in[i * OD_FILT_BSTRIDE + j + 2 * offset] - yy;
- if (abs(p) < athresh) sum += p;
+ if (abs(p) < threshold) sum += p;
p = in[i * OD_FILT_BSTRIDE + j - 2 * offset] - yy;
- if (abs(p) < athresh) sum += p;
+ if (abs(p) < threshold) sum += p;
y[i * ystride + j] = yy + ((3 * sum + 8) >> 4);
}
}
}
+/* Smooth in the direction orthogonal to what was detected. */
void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
- const int16_t *in, const od_dering_in *x,
- int xstride, int threshold, int dir) {
- od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 2, threshold, dir);
-}
-
-void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
- const int16_t *in, const od_dering_in *x,
- int xstride, int threshold, int dir) {
- od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 3, threshold, dir);
+ const int16_t *in, int threshold,
+ int dir) {
+ int i;
+ int j;
+ int offset;
+ if (dir > 0 && dir < 4)
+ offset = OD_FILT_BSTRIDE;
+ else
+ offset = 1;
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ int16_t yy;
+ int16_t sum;
+ int16_t p;
+ yy = in[i * OD_FILT_BSTRIDE + j];
+ sum = 0;
+ p = in[i * OD_FILT_BSTRIDE + j + offset] - yy;
+ if (abs(p) < threshold) sum += p;
+ p = in[i * OD_FILT_BSTRIDE + j - offset] - yy;
+ if (abs(p) < threshold) sum += p;
+ y[i * ystride + j] = yy + ((5 * sum + 8) >> 4);
+ }
+ }
}
/* This table approximates x^0.16 with the index being log2(x). It is clamped
@@ -225,34 +249,24 @@
327, 365, 408, 455, 509, 569, 635, 710, 768,
};
-/* Compute deringing filter threshold for each 8x8 block based on the
+/* Compute deringing filter threshold for an 8x8 block based on the
directional variance difference. A high variance difference means that we
have a highly directional pattern (e.g. a high contrast edge), so we can
apply more deringing. A low variance means that we either have a low
contrast edge, or a non-directional texture, so we want to be careful not
to blur. */
-static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
- int threshold,
- int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
- int nhb, int nvb) {
- int bx;
- int by;
- for (by = 0; by < nvb; by++) {
- for (bx = 0; bx < nhb; bx++) {
- int v1;
- /* We use the variance of 8x8 blocks to adjust the threshold. */
- v1 = OD_MINI(32767, var[by][bx] >> 6);
- thresh[by][bx] = (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
- }
- }
+static INLINE int od_adjust_thresh(int threshold, int32_t var) {
+ int v1;
+ /* We use the variance of 8x8 blocks to adjust the threshold. */
+ v1 = OD_MINI(32767, var >> 6);
+ return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
}
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
- const od_dering_in *x, int xstride, int nhb, int nvb, int sbx,
- int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+ int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
unsigned char *bskip, int skip_stride, int threshold,
- int overlap, int coeff_shift) {
+ int coeff_shift) {
int i;
int j;
int bx;
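od_adjust_thresh() is plain Q8 fixed point: the result is threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] / 256, rounded. With threshold 64 and a (hypothetical) table entry of 509, for example, (64 * 509 + 128) >> 8 = 127, so that variance bucket roughly doubles the threshold, while table entries below 256 scale it down.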
@@ -261,7 +275,13 @@
int16_t *in;
int bsize;
int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
- int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+ int filter2_thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+ od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
+ od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+ };
+ od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
+ od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
+ };
bsize = 3 - xdec;
in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
/* We avoid filtering the pixels for which some of the pixels to average
@@ -275,62 +295,44 @@
in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
}
}
+ /* Assume the deringing filter is sparsely applied, so do one large copy rather
+ than small copies later if deringing is skipped. */
+ for (i = 0; i < nvb << bsize; i++) {
+ for (j = 0; j < nhb << bsize; j++) {
+ y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
+ }
+ }
if (pli == 0) {
for (by = 0; by < nvb; by++) {
for (bx = 0; bx < nhb; bx++) {
+ if (bskip[by * skip_stride + bx]) continue;
dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
&var[by][bx], coeff_shift);
+ /* Deringing orthogonal to the direction uses a tighter threshold
+ because we want to be conservative. We've presumably already
+ achieved some deringing, so the amount of change is expected
+ to be low. Also, since we might be filtering across an edge, we
+ want to make sure not to blur it. That being said, we might want
+ to be a little bit more aggressive on pure horizontal/vertical
+ since the ringing there tends to be directional, so it doesn't
+ get removed by the directional filtering. */
+ filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+ &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+ od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
}
}
- od_compute_thresh(thresh, threshold, var, nhb, nvb);
} else {
for (by = 0; by < nvb; by++) {
for (bx = 0; bx < nhb; bx++) {
- thresh[by][bx] = threshold;
+ if (bskip[by * skip_stride + bx]) continue;
+ filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+ &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
+ dir[by][bx]);
}
}
}
- for (by = 0; by < nvb; by++) {
- for (bx = 0; bx < nhb; bx++) {
- int skip;
-#if defined(DAALA_ODINTRIN)
- int xstart;
- int ystart;
- int xend;
- int yend;
- xstart = ystart = 0;
- xend = yend = (2 >> xdec);
- if (overlap) {
- xstart -= (sbx != 0);
- ystart -= (sby != 0);
- xend += (sbx != nhsb - 1);
- yend += (sby != nvsb - 1);
- }
- skip = 1;
- /* We look at whether the current block and its 4x4 surrounding (due to
- lapping) are skipped to avoid filtering the same content multiple
- times. */
- for (i = ystart; i < yend; i++) {
- for (j = xstart; j < xend; j++) {
- skip = skip && bskip[((by << 1 >> xdec) + i) * skip_stride +
- (bx << 1 >> xdec) + j];
- }
- }
-#else
- (void)overlap;
- skip = bskip[by * skip_stride + bx];
-#endif
- if (skip) thresh[by][bx] = 0;
- }
- }
- for (by = 0; by < nvb; by++) {
- for (bx = 0; bx < nhb; bx++) {
- (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
- &y[(by * ystride << bsize) + (bx << bsize)], ystride,
- &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
- dir[by][bx]);
- }
- }
for (i = 0; i < nvb << bsize; i++) {
for (j = 0; j < nhb << bsize; j++) {
in[i * OD_FILT_BSTRIDE + j] = y[i * ystride + j];
@@ -338,10 +340,10 @@
}
for (by = 0; by < nvb; by++) {
for (bx = 0; bx < nhb; bx++) {
- (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+ if (bskip[by * skip_stride + bx] || filter2_thresh[by][bx] == 0) continue;
+ (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
&y[(by * ystride << bsize) + (bx << bsize)], ystride,
- &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
- &x[(by * xstride << bsize) + (bx << bsize)], xstride, thresh[by][bx],
+ &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
dir[by][bx]);
}
}
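A useful way to read the new return values: for both block sizes the directional filter returns, up to rounding, four times the mean absolute per-pixel adjustment it made, and that activity measure becomes the threshold for the orthogonal pass, which is skipped when it is zero (the one-shot copy of in[] into y[] near the top of od_dering() is what lets both stages simply continue past skipped blocks). A compact restatement of the relation:

#include <stdlib.h>

/* Stage-2 threshold as computed by the direction filters above: both the
   8x8 and 4x4 variants return 4 * mean(|per-pixel adjustment|), rounded. */
static int stage2_thresh(const int *adjust, int npixels) {
  int total_abs = 0;
  int i;
  for (i = 0; i < npixels; ++i) total_abs += abs(adjust[i]);
  /* 64 pixels: (total + 8) >> 4 == round(total / 16) == 4 * total / 64.
     16 pixels: (total + 2) >> 2 == round(total / 4)  == 4 * total / 16. */
  return npixels == 64 ? (total_abs + 8) >> 4 : (total_abs + 2) >> 2;
}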
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 17fee7d..c64439f 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -21,12 +21,6 @@
#define OD_DERINGSIZES (2)
-#define OD_DERING_NO_CHECK_OVERLAP (0)
-#define OD_DERING_CHECK_OVERLAP (1)
-
-#define OD_DERING_LEVELS (6)
-extern const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS];
-
#define OD_DERING_NBLOCKS (OD_BSIZE_MAX / 8)
#define OD_FILT_BORDER (3)
@@ -34,46 +28,25 @@
extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
-typedef void (*od_filter_dering_direction_func)(int16_t *y, int ystride,
- const int16_t *in,
- int threshold, int dir);
+typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride,
+ const int16_t *in, int threshold,
+ int dir);
typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
const int16_t *in,
- const od_dering_in *x,
- int xstride, int threshold,
- int dir);
-
-struct od_dering_opt_vtbl {
- od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
- od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
-};
-typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
-
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
- const od_dering_in *x, int xstride, int nvb, int nhb, int sbx,
- int sby, int nhsb, int nvsb, int xdec,
+ int threshold, int dir);
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+ int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
unsigned char *bskip, int skip_stride, int threshold,
- int overlap, int coeff_shift);
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
- int ln, int threshold, int dir);
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
- const od_dering_in *x, int xstride, int ln,
- int threshold, int dir);
-
-extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
-
-void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
- const int16_t *in, int threshold,
- int dir);
-void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
- const int16_t *in, int threshold,
- int dir);
+ int coeff_shift);
+int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
+ int threshold, int dir);
+int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
+ int threshold, int dir);
void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
- const int16_t *in, const od_dering_in *x,
- int xstride, int threshold, int dir);
+ const int16_t *in, int threshold,
+ int dir);
void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
- const int16_t *in, const od_dering_in *x,
- int xstride, int threshold, int dir);
-
+ const int16_t *in, int threshold,
+ int dir);
#endif
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 98f4f51..3a2203a 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -151,12 +151,27 @@
int use_highbitdepth;
#endif
#if CONFIG_CLPF
- int clpf_numblocks;
- int clpf_size;
+ // Two bits are used to signal the strength for all blocks and the
+ // valid values are:
+ // 0: no filtering
+ // 1: strength = 1
+ // 2: strength = 2
+ // 3: strength = 4
int clpf_strength_y;
int clpf_strength_u;
int clpf_strength_v;
- uint8_t *clpf_blocks;
+
+ // If clpf_strength_y is not 0, another two bits are used to signal
+ // the filter block size. The valid values for clpf_size are:
+ // 0: no block signalling
+ // 1: 32x32
+ // 2: 64x64
+ // 3: 128x128
+ CLPF_BLOCK_SIZE clpf_size;
+
+ // Buffer for storing whether to filter individual blocks.
+ int8_t *clpf_blocks;
+ int clpf_stride;
#endif
YV12_BUFFER_CONFIG *frame_to_show;
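In decoder terms the two mappings documented above come down to the expressions used later in decodeframe.c, strength + (strength == 3) and 4 + clpf_size; a compact sketch (hypothetical helpers):

/* 2-bit strength code -> filter strength {0, 1, 2, 4}. */
static int clpf_strength_from_code(int code) { return code + (code == 3); }

/* 2-bit size code -> filter block size in pixels (0 = no block signalling);
   equivalently, fb_size_log2 = 4 + code for nonzero codes. */
static int clpf_fb_size_from_code(int code) { return code ? 16 << code : 0; }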
diff --git a/av1/common/x86/od_dering_sse4.c b/av1/common/x86/od_dering_sse4.c
new file mode 100644
index 0000000..80bdba7
--- /dev/null
+++ b/av1/common/x86/od_dering_sse4.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/x86/od_dering_sse4.h"
+
+/* partial A is a 16-bit vector of the form:
+ [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+ [0 y1 y2 y3 y4 y5 y6 y7].
+ This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+ (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+ and const2. */
+static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
+ __m128i const1, __m128i const2) {
+ __m128i tmp;
+ /* Reverse partial B. */
+ partialb = _mm_shuffle_epi8(
+ partialb,
+ _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ tmp = partiala;
+ partiala = _mm_unpacklo_epi16(partiala, partialb);
+ partialb = _mm_unpackhi_epi16(tmp, partialb);
+ /* Square and add the corresponding x and y values. */
+ partiala = _mm_madd_epi16(partiala, partiala);
+ partialb = _mm_madd_epi16(partialb, partialb);
+ /* Multiply by constant. */
+ partiala = _mm_mullo_epi32(partiala, const1);
+ partialb = _mm_mullo_epi32(partialb, const2);
+ /* Sum all results. */
+ partiala = _mm_add_epi32(partiala, partialb);
+ return partiala;
+}
+
+static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
+ __m128i t0, t1, t2, t3;
+ t0 = _mm_unpacklo_epi32(x0, x1);
+ t1 = _mm_unpacklo_epi32(x2, x3);
+ t2 = _mm_unpackhi_epi32(x0, x1);
+ t3 = _mm_unpackhi_epi32(x2, x3);
+ x0 = _mm_unpacklo_epi64(t0, t1);
+ x1 = _mm_unpackhi_epi64(t0, t1);
+ x2 = _mm_unpacklo_epi64(t2, t3);
+ x3 = _mm_unpackhi_epi64(t2, t3);
+ return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
+}
+
+/* Horizontal sum of 8x16-bit unsigned values. */
+static INLINE int32_t hsum_epi16(__m128i a) {
+ a = _mm_madd_epi16(a, _mm_set1_epi16(1));
+ a = _mm_hadd_epi32(a, a);
+ a = _mm_hadd_epi32(a, a);
+ return _mm_cvtsi128_si32(a);
+}
+
+/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
+ to compute the remaining directions. */
+static INLINE __m128i compute_directions(__m128i lines[8],
+ int32_t tmp_cost1[4]) {
+ __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ __m128i partial6;
+ __m128i tmp;
+ /* Partial sums for lines 0 and 1. */
+ partial4a = _mm_slli_si128(lines[0], 14);
+ partial4b = _mm_srli_si128(lines[0], 2);
+ partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
+ partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
+ tmp = _mm_add_epi16(lines[0], lines[1]);
+ partial5a = _mm_slli_si128(tmp, 10);
+ partial5b = _mm_srli_si128(tmp, 6);
+ partial7a = _mm_slli_si128(tmp, 4);
+ partial7b = _mm_srli_si128(tmp, 12);
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
+ partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
+ partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
+ partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
+ tmp = _mm_add_epi16(lines[2], lines[3]);
+ partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
+ partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
+ partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
+ partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
+ partial6 = _mm_add_epi16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
+ partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
+ partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
+ partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
+ tmp = _mm_add_epi16(lines[4], lines[5]);
+ partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
+ partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
+ partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
+ partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
+ partial6 = _mm_add_epi16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
+ partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
+ partial4a = _mm_add_epi16(partial4a, lines[7]);
+ tmp = _mm_add_epi16(lines[6], lines[7]);
+ partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
+ partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
+ partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
+ partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
+ partial6 = _mm_add_epi16(partial6, tmp);
+
+ /* Compute costs in terms of partial sums. */
+ partial4a =
+ fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
+ _mm_set_epi32(105, 120, 140, 168));
+ partial7a =
+ fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
+ _mm_set_epi32(105, 105, 105, 140));
+ partial5a =
+ fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
+ _mm_set_epi32(105, 105, 105, 140));
+ partial6 = _mm_madd_epi16(partial6, partial6);
+ partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
+
+ partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+ _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
+ return partial4a;
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+ counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+ res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8];
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ __m128i lines[8];
+ __m128i dir03, dir47;
+ __m128i max;
+ for (i = 0; i < 8; i++) {
+ lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
+ lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
+ _mm_set1_epi16(128));
+ }
+
+ /* Compute "mostly vertical" directions. */
+ dir47 = compute_directions(lines, cost + 4);
+
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ dir03 = compute_directions(lines, cost);
+
+#if 1
+ max = _mm_max_epi32(dir03, dir47);
+ max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
+ max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
+ dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
+ _mm_setr_epi32(-1, -2, -3, -4));
+ dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
+ _mm_setr_epi32(-5, -6, -7, -8));
+ dir03 = _mm_max_epu32(dir03, dir47);
+ dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
+ dir03 =
+ _mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
+ dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));
+
+ best_dir = _mm_cvtsi128_si32(dir03);
+ best_cost = _mm_cvtsi128_si32(max);
+#else
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+#endif
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
+static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
+ return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
+}
+
+int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride,
+ const int16_t *in, int threshold,
+ int dir) {
+ int i;
+ __m128i sum;
+ __m128i p;
+ __m128i cmp;
+ __m128i row;
+ __m128i res;
+ __m128i tmp;
+ __m128i thresh;
+ __m128i total_abs;
+ int off1, off2;
+ off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+ off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+ total_abs = _mm_setzero_si128();
+ thresh = _mm_set1_epi16(threshold);
+ for (i = 0; i < 4; i += 2) {
+ sum = _mm_set1_epi16(0);
+ row = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ tmp = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
+ p = _mm_sub_epi16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_slli_epi16(p, 2);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ tmp = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
+ p = _mm_sub_epi16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_slli_epi16(p, 2);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ tmp = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
+ p = _mm_sub_epi16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ tmp = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
+ p = _mm_sub_epi16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*res = row + ((sum + 8) >> 4)*/
+ res = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ res = _mm_srai_epi16(res, 4);
+ total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
+ res = _mm_add_epi16(row, res);
+ _mm_storel_epi64((__m128i *)&y[i * ystride], res);
+ _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
+ _mm_unpackhi_epi64(res, res));
+ }
+ return (hsum_epi16(total_abs) + 2) >> 2;
+}
+
+int od_filter_dering_direction_8x8_sse4_1(int16_t *y, int ystride,
+ const int16_t *in, int threshold,
+ int dir) {
+ int i;
+ __m128i sum;
+ __m128i p;
+ __m128i cmp;
+ __m128i row;
+ __m128i res;
+ __m128i thresh;
+ __m128i total_abs;
+ int off1, off2, off3;
+ off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+ off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+ off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
+ total_abs = _mm_setzero_si128();
+ thresh = _mm_set1_epi16(threshold);
+ for (i = 0; i < 8; i++) {
+ sum = _mm_set1_epi16(0);
+ row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_slli_epi16(p, 1);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_slli_epi16(p, 1);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*res = row + ((sum + 8) >> 4)*/
+ res = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ res = _mm_srai_epi16(res, 4);
+ total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
+ res = _mm_add_epi16(row, res);
+ _mm_storeu_si128((__m128i *)&y[i * ystride], res);
+ }
+ return (hsum_epi16(total_abs) + 8) >> 4;
+}
+
+void od_filter_dering_orthogonal_4x4_sse4_1(int16_t *y, int ystride,
+ const int16_t *in, int threshold,
+ int dir) {
+ int i;
+ int offset;
+ __m128i res;
+ __m128i p;
+ __m128i cmp;
+ __m128i row;
+ __m128i sum;
+ __m128i tmp;
+ __m128i thresh;
+ thresh = _mm_set1_epi16(threshold);
+ if (dir > 0 && dir < 4)
+ offset = OD_FILT_BSTRIDE;
+ else
+ offset = 1;
+ for (i = 0; i < 4; i += 2) {
+ sum = _mm_set1_epi16(0);
+ row = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));
+
+ /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
+ tmp = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + offset]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + offset]));
+ p = _mm_sub_epi16(tmp, row);
+ /*if (abs(p) < threshold) sum += p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
+ tmp = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - offset]),
+ _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - offset]));
+ p = _mm_sub_epi16(tmp, row);
+ /*if (abs(p) < threshold) sum += p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*row + ((5*sum + 8) >> 4)*/
+ res = _mm_mullo_epi16(sum, _mm_set1_epi16(5));
+ res = _mm_add_epi16(res, _mm_set1_epi16(8));
+ res = _mm_srai_epi16(res, 4);
+ res = _mm_add_epi16(res, row);
+ _mm_storel_epi64((__m128i *)&y[i * ystride], res);
+ _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
+ _mm_unpackhi_epi64(res, res));
+ }
+}
+
+void od_filter_dering_orthogonal_8x8_sse4_1(int16_t *y, int ystride,
+ const int16_t *in, int threshold,
+ int dir) {
+ int i;
+ int offset;
+ __m128i res;
+ __m128i p;
+ __m128i cmp;
+ __m128i row;
+ __m128i sum;
+ __m128i thresh;
+ thresh = _mm_set1_epi16(threshold);
+ if (dir > 0 && dir < 4)
+ offset = OD_FILT_BSTRIDE;
+ else
+ offset = 1;
+ for (i = 0; i < 8; i++) {
+ sum = _mm_set1_epi16(0);
+ row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);
+
+ /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + 1 * offset]), row);
+ /*if (abs(p) < thresh) sum += p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - 1 * offset]), row);
+ /*if (abs(p) < threshold) sum += p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + 2 * offset]), row);
+ /*if (abs(p) < threshold) sum += p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
+ p = _mm_sub_epi16(
+ _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - 2 * offset]), row);
+ /*if (abs(p) < threshold) sum += p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = _mm_and_si128(p, cmp);
+ sum = _mm_add_epi16(sum, p);
+
+ /*row + ((3*sum + 8) >> 4)*/
+ res = _mm_mullo_epi16(sum, _mm_set1_epi16(3));
+ res = _mm_add_epi16(res, _mm_set1_epi16(8));
+ res = _mm_srai_epi16(res, 4);
+ res = _mm_add_epi16(res, row);
+ _mm_storeu_si128((__m128i *)&y[i * ystride], res);
+ }
+}
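Two idioms in the new file are worth spelling out. First, every if (abs(p) < threshold) from the C code becomes branchless: od_cmplt_abs_epi16() produces an all-ones/all-zeros lane mask that is ANDed into the tap contribution. Second, the #if 1 block in od_dir_find8_sse4_1() picks the best direction without the scalar loop kept under #else: lanes whose cost equals the broadcast maximum hold -(i + 1), the other lanes hold 0, an unsigned max then favors the smallest direction index, and the final XOR with all-ones recovers it, since ~(-(i + 1)) == i. A tiny check of that encoding (hypothetical helper):

#include <assert.h>
#include <stdint.h>

/* Verify the index encoding used by the SIMD argmax: direction i is
   written as -(i + 1) (0xFFFFFFFF for i = 0 down to 0xFFFFFFF8 for 7),
   so larger-as-unsigned means smaller index, and ~x restores i. */
static void check_dir_encoding(void) {
  uint32_t i;
  for (i = 0; i < 8; ++i) {
    uint32_t enc = (uint32_t)-(int32_t)(i + 1);
    assert(~enc == i);
  }
}

Note that both paths break ties the same way: the scalar loop's strict > keeps the first maximum, and the unsigned max keeps the smallest direction index.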
diff --git a/av1/common/x86/od_dering_sse4.h b/av1/common/x86/od_dering_sse4.h
new file mode 100644
index 0000000..950ec5f
--- /dev/null
+++ b/av1/common/x86/od_dering_sse4.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/od_dering.h"
+#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
+#define AOM_COMMON_OD_DERING_X86_SSE4_H_
+#endif // AOM_COMMON_OD_DERING_X86_SSE4_H_
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index e7a0578..acca4cb 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1440,6 +1440,22 @@
}
}
#endif // CONFIG_SUPERTX
+#if CONFIG_CLPF
+static int clpf_all_skip(const AV1_COMMON *cm, int mi_col, int mi_row,
+ int size) {
+ int r, c;
+ int skip = 1;
+ const int maxc = AOMMIN(size, cm->mi_cols - mi_col);
+ const int maxr = AOMMIN(size, cm->mi_rows - mi_row);
+ for (r = 0; r < maxr && skip; r++) {
+ for (c = 0; c < maxc && skip; c++) {
+ skip &= !!cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]
+ ->mbmi.skip;
+ }
+ }
+ return skip;
+}
+#endif
// TODO(slavarnway): eliminate bsize and subsize in future commits
static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
@@ -1772,7 +1788,44 @@
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
-#if DERING_REFINEMENT
+
+#if CONFIG_CLPF
+ if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
+ cm->clpf_size != CLPF_NOSIZE) {
+ const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
+ mi_col * MI_SIZE / MIN_FB_SIZE;
+
+ if (!((mi_row * MI_SIZE) & 127) && !((mi_col * MI_SIZE) & 127) &&
+ cm->clpf_size == CLPF_128X128) {
+ cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+ } else if (cm->clpf_size == CLPF_64X64 &&
+ !clpf_all_skip(cm, mi_col, mi_row, 64 / MI_SIZE)) {
+ cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+ } else if (cm->clpf_size == CLPF_32X32) {
+ const int tr = tl + 1;
+ const int bl = tl + cm->clpf_stride;
+ const int br = tr + cm->clpf_stride;
+ const int size = 32 / MI_SIZE;
+
+ // Up to four bits per SB
+ if (!clpf_all_skip(cm, mi_col, mi_row, size))
+ cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+
+ if (mi_col + size < cm->mi_cols &&
+ !clpf_all_skip(cm, mi_col + size, mi_row, size))
+ cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR);
+
+ if (mi_row + size < cm->mi_rows &&
+ !clpf_all_skip(cm, mi_col, mi_row + size, size))
+ cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR);
+
+ if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows &&
+ !clpf_all_skip(cm, mi_col + size, mi_row + size, size))
+ cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR);
+ }
+ }
+#endif
+#if CONFIG_DERING
if (bsize == BLOCK_64X64) {
if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
@@ -1782,7 +1835,7 @@
0;
}
}
-#endif // DERGING_REFINEMENT
+#endif
#endif // CONFIG_EXT_PARTITION_TYPES
}
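The tl/tr/bl/br indices above address a row-major grid of per-filter-block flags, clpf_stride entries per row, one flag per MIN_FB_SIZE x MIN_FB_SIZE area; tr, bl and br are simply the three other 32x32 quadrants of the 64x64 SB when clpf_size == CLPF_32X32. A sketch of the addressing (hypothetical helper):

/* Flag index for the filter block containing pixel (x, y), as in the
   tl = (y / MIN_FB_SIZE) * clpf_stride + (x / MIN_FB_SIZE) computation
   above (with x = mi_col * MI_SIZE, y = mi_row * MI_SIZE). */
static int clpf_block_index(int x, int y, int min_fb_size, int stride) {
  return (y / min_fb_size) * stride + x / min_fb_size;
}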
@@ -2045,20 +2098,26 @@
}
#if CONFIG_CLPF
-static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static void setup_clpf(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int width = pbi->cur_buf->buf.y_crop_width;
+ const int height = pbi->cur_buf->buf.y_crop_height;
+
cm->clpf_blocks = 0;
cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
if (cm->clpf_strength_y) {
cm->clpf_size = aom_rb_read_literal(rb, 2);
- if (cm->clpf_size) {
- int i;
- cm->clpf_numblocks = aom_rb_read_literal(rb, av1_clpf_maxbits(cm));
- CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(cm->clpf_numblocks));
- for (i = 0; i < cm->clpf_numblocks; i++) {
- cm->clpf_blocks[i] = aom_rb_read_literal(rb, 1);
- }
+ if (cm->clpf_size != CLPF_NOSIZE) {
+ int size;
+ cm->clpf_stride =
+ ((width + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >> MIN_FB_SIZE_LOG2;
+ size =
+ cm->clpf_stride * ((height + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >>
+ MIN_FB_SIZE_LOG2;
+ CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
+ memset(cm->clpf_blocks, -1, size);
}
}
}
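The allocation above rounds both dimensions up to whole filter blocks: (width + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1) is the width rounded up to a multiple of MIN_FB_SIZE, and the shift by MIN_FB_SIZE_LOG2 converts pixels to flag counts. Assuming MIN_FB_SIZE is 32, a 1920x1080 frame gives clpf_stride = 1920 / 32 = 60 and size = 60 * (1088 / 32) = 2040 flags; the memset to -1 marks every flag CLPF_NOFLAG until decode_partition() reads a bit for it.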
@@ -2068,7 +2127,7 @@
UNUSED const YV12_BUFFER_CONFIG *org,
UNUSED const AV1_COMMON *cm, UNUSED int block_size,
UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
- UNUSED unsigned int fb_size_log2, uint8_t *bit) {
+ UNUSED unsigned int fb_size_log2, int8_t *bit) {
return *bit;
}
#endif
@@ -3224,11 +3283,10 @@
memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
pbi->need_resync = 0;
}
- if (frame_is_intra_only(cm))
- cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+ cm->allow_screen_content_tools = aom_rb_read_bit(rb);
} else {
cm->intra_only = cm->show_frame ? 0 : aom_rb_read_bit(rb);
-
+ if (cm->intra_only) cm->allow_screen_content_tools = aom_rb_read_bit(rb);
if (cm->error_resilient_mode) {
cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
} else {
@@ -3362,7 +3420,7 @@
setup_loopfilter(cm, rb);
#if CONFIG_CLPF
- setup_clpf(cm, rb);
+ setup_clpf(pbi, rb);
#endif
#if CONFIG_DERING
setup_dering(cm, rb);
@@ -3934,18 +3992,18 @@
if (!cm->skip_loop_filter) {
const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
if (cm->clpf_strength_y) {
- av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
+ av1_clpf_frame(frame, NULL, cm, cm->clpf_size != CLPF_NOSIZE,
cm->clpf_strength_y + (cm->clpf_strength_y == 3),
- 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
+ 4 + cm->clpf_size, AOM_PLANE_Y, clpf_bit);
}
if (cm->clpf_strength_u) {
- av1_clpf_frame(frame, NULL, cm, 0,
- cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
+ av1_clpf_frame(frame, NULL, cm, 0, // No block signals for chroma
+ cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4,
AOM_PLANE_U, NULL);
}
if (cm->clpf_strength_v) {
- av1_clpf_frame(frame, NULL, cm, 0,
- cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
+ av1_clpf_frame(frame, NULL, cm, 0, // No block signals for chroma
+ cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4,
AOM_PLANE_V, NULL);
}
}
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index b83ab3d..9a40f69 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -39,7 +39,6 @@
if (counts) ++coef_counts[band][ctx][token]; \
} while (0)
-#if !CONFIG_ANS
static INLINE int read_coeff(const aom_prob *probs, int n, aom_reader *r) {
int i, val = 0;
for (i = 0; i < n; ++i) val = (val << 1) | aom_read(r, probs[i]);
@@ -75,6 +74,11 @@
const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size_ctx][type][ref];
const aom_prob *prob;
+#if CONFIG_ANS
+ const aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ fc->coef_cdfs[tx_size_ctx][type][ref];
+ const aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
+#endif // CONFIG_ANS
unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
uint8_t token_cache[MAX_TX_SQUARE];
@@ -160,7 +164,53 @@
dqv_val = &dq_val[band][0];
#endif // CONFIG_NEW_QUANT
}
-
+#if CONFIG_ANS
+ cdf = &coef_cdfs[band][ctx];
+ token =
+ ONE_TOKEN + aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1);
+ INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
+ switch (token) {
+ case ONE_TOKEN:
+ case TWO_TOKEN:
+ case THREE_TOKEN:
+ case FOUR_TOKEN: val = token; break;
+ case CATEGORY1_TOKEN:
+ val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, r);
+ break;
+ case CATEGORY2_TOKEN:
+ val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, r);
+ break;
+ case CATEGORY3_TOKEN:
+ val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, r);
+ break;
+ case CATEGORY4_TOKEN:
+ val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, r);
+ break;
+ case CATEGORY5_TOKEN:
+ val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
+ break;
+ case CATEGORY6_TOKEN: {
+ const int skip_bits = TX_SIZES - 1 - txsize_sqr_up_map[tx_size];
+ const uint8_t *cat6p = cat6_prob + skip_bits;
+#if CONFIG_AOM_HIGHBITDEPTH
+ switch (xd->bd) {
+ case AOM_BITS_8:
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
+ break;
+ case AOM_BITS_10:
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
+ break;
+ case AOM_BITS_12:
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
+ break;
+ default: assert(0); return -1;
+ }
+#else
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
+#endif
+ } break;
+ }
+#else
if (!aom_read(r, prob[ONE_CONTEXT_NODE])) {
INCREMENT_COUNT(ONE_TOKEN);
token = ONE_TOKEN;
@@ -211,8 +261,8 @@
}
}
}
+#endif // CONFIG_ANS
#if CONFIG_NEW_QUANT
-
v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
#else
@@ -240,186 +290,6 @@
return c;
}
-#else // !CONFIG_ANS
-static INLINE int read_coeff(const aom_prob *const probs, int n,
- struct AnsDecoder *const ans) {
- int i, val = 0;
- for (i = 0; i < n; ++i) val = (val << 1) | uabs_read(ans, probs[i]);
- return val;
-}
-
-static int decode_coefs_ans(const MACROBLOCKD *const xd, PLANE_TYPE type,
- tran_low_t *dqcoeff, TX_SIZE tx_size,
- TX_TYPE tx_type, const int16_t *dq,
-#if CONFIG_NEW_QUANT
- dequant_val_type_nuq *dq_val,
-#endif // CONFIG_NEW_QUANT
- int ctx, const int16_t *scan, const int16_t *nb,
- struct AnsDecoder *const ans) {
- FRAME_COUNTS *counts = xd->counts;
- const int max_eob = get_tx2d_size(tx_size);
- const FRAME_CONTEXT *const fc = xd->fc;
- const int ref = is_inter_block(&xd->mi[0]->mbmi);
- int band, c = 0;
- int skip_eob = 0;
- const int tx_size_ctx = txsize_sqr_map[tx_size];
- const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
- fc->coef_probs[tx_size_ctx][type][ref];
- const rans_lut(*coef_cdfs)[COEFF_CONTEXTS] =
- fc->coef_cdfs[tx_size_ctx][type][ref];
- const aom_prob *prob;
- const rans_lut *cdf;
- unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
- unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
- uint8_t token_cache[MAX_TX_SQUARE];
- const uint8_t *band_translate = get_band_translate(tx_size);
- int dq_shift;
- int v, token;
- int16_t dqv = dq[0];
-#if CONFIG_NEW_QUANT
- const tran_low_t *dqv_val = &dq_val[0][0];
-#endif // CONFIG_NEW_QUANT
- const uint8_t *cat1_prob;
- const uint8_t *cat2_prob;
- const uint8_t *cat3_prob;
- const uint8_t *cat4_prob;
- const uint8_t *cat5_prob;
- const uint8_t *cat6_prob;
-
- dq_shift = get_tx_scale(xd, tx_type, tx_size);
-
- if (counts) {
- coef_counts = counts->coef[tx_size_ctx][type][ref];
- eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref];
- }
-
-#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->bd > AOM_BITS_8) {
- if (xd->bd == AOM_BITS_10) {
- cat1_prob = av1_cat1_prob_high10;
- cat2_prob = av1_cat2_prob_high10;
- cat3_prob = av1_cat3_prob_high10;
- cat4_prob = av1_cat4_prob_high10;
- cat5_prob = av1_cat5_prob_high10;
- cat6_prob = av1_cat6_prob_high10;
- } else {
- cat1_prob = av1_cat1_prob_high12;
- cat2_prob = av1_cat2_prob_high12;
- cat3_prob = av1_cat3_prob_high12;
- cat4_prob = av1_cat4_prob_high12;
- cat5_prob = av1_cat5_prob_high12;
- cat6_prob = av1_cat6_prob_high12;
- }
- } else {
- cat1_prob = av1_cat1_prob;
- cat2_prob = av1_cat2_prob;
- cat3_prob = av1_cat3_prob;
- cat4_prob = av1_cat4_prob;
- cat5_prob = av1_cat5_prob;
- cat6_prob = av1_cat6_prob;
- }
-#else
- cat1_prob = av1_cat1_prob;
- cat2_prob = av1_cat2_prob;
- cat3_prob = av1_cat3_prob;
- cat4_prob = av1_cat4_prob;
- cat5_prob = av1_cat5_prob;
- cat6_prob = av1_cat6_prob;
-#endif
-
- while (c < max_eob) {
- int val = -1;
- band = *band_translate++;
- prob = coef_probs[band][ctx];
- if (!skip_eob) {
- if (counts) ++eob_branch_count[band][ctx];
- if (!uabs_read(ans, prob[EOB_CONTEXT_NODE])) {
- INCREMENT_COUNT(EOB_MODEL_TOKEN);
- break;
- }
- }
-
-#if CONFIG_NEW_QUANT
- dqv_val = &dq_val[band][0];
-#endif // CONFIG_NEW_QUANT
-
- cdf = &coef_cdfs[band][ctx];
- token = ZERO_TOKEN + rans_read(ans, *cdf);
- if (token == ZERO_TOKEN) {
- INCREMENT_COUNT(ZERO_TOKEN);
- token_cache[scan[c]] = 0;
- skip_eob = 1;
- } else {
- INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
- switch (token) {
- case ONE_TOKEN:
- case TWO_TOKEN:
- case THREE_TOKEN:
- case FOUR_TOKEN: val = token; break;
- case CATEGORY1_TOKEN:
- val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, ans);
- break;
- case CATEGORY2_TOKEN:
- val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, ans);
- break;
- case CATEGORY3_TOKEN:
- val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, ans);
- break;
- case CATEGORY4_TOKEN:
- val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, ans);
- break;
- case CATEGORY5_TOKEN:
- val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, ans);
- break;
- case CATEGORY6_TOKEN: {
- const int skip_bits = TX_SIZES - 1 - txsize_sqr_up_map[tx_size];
- const uint8_t *cat6p = cat6_prob + skip_bits;
-#if CONFIG_AOM_HIGHBITDEPTH
- switch (xd->bd) {
- case AOM_BITS_8:
- val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, ans);
- break;
- case AOM_BITS_10:
- val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, ans);
- break;
- case AOM_BITS_12:
- val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, ans);
- break;
- default: assert(0); return -1;
- }
-#else
- val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, ans);
-#endif
- } break;
- }
-#if CONFIG_NEW_QUANT
- v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
- v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
-#else
- v = (val * dqv) >> dq_shift;
-#endif // CONFIG_NEW_QUANT
-
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-#if CONFIG_AOM_HIGHBITDEPTH
- dqcoeff[scan[c]] =
- highbd_check_range((uabs_read_bit(ans) ? -v : v), xd->bd);
-#else
- dqcoeff[scan[c]] = check_range(uabs_read_bit(ans) ? -v : v);
-#endif // CONFIG_AOM_HIGHBITDEPTH
-#else
- dqcoeff[scan[c]] = uabs_read_bit(ans) ? -v : v;
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- token_cache[scan[c]] = av1_pt_energy_class[token];
- skip_eob = 0;
- }
- ++c;
- ctx = get_coef_context(nb, token_cache, c);
- dqv = dq[1];
- }
-
- return c;
-}
-#endif // !CONFIG_ANS
// TODO(slavarnway): Decode version of av1_set_context. Modify
// av1_set_context
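With the ANS-only decode_coefs_ans() removed above, both entropy coders share one decode path: the EOB and ZERO decisions remain binary reads on the unconstrained tree nodes, and only the remaining alphabet goes through the CDF. That is why the aom_read_symbol() call added earlier uses CATEGORY6_TOKEN - ONE_TOKEN + 1 = 10 symbols, exactly the width of the av1_pareto8_token_probs rows (ENTROPY_TOKENS - 2, i.e. every token except ZERO_TOKEN and EOB_TOKEN). The encoder mirrors this in bitstream.c below, where aom_write_symbol(w, t - ONE_TOKEN, *p->token_cdf, ...) writes against the same 10-entry CDF.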
@@ -510,7 +380,6 @@
get_dq_profile_from_ctx(xd->qindex[seg_id], ctx, ref, pd->plane_type);
#endif // CONFIG_NEW_QUANT
-#if !CONFIG_ANS
#if CONFIG_AOM_QM
const int eob =
decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
@@ -523,14 +392,6 @@
#endif // CONFIG_NEW_QUANT
ctx, sc->scan, sc->neighbors, r);
#endif // CONFIG_AOM_QM
-#else
- const int eob = decode_coefs_ans(xd, pd->plane_type, pd->dqcoeff, tx_size,
- tx_type, dequant,
-#if CONFIG_NEW_QUANT
- pd->seg_dequant_nuq[seg_id][dq],
-#endif // CONFIG_NEW_QUANT
- ctx, sc->scan, sc->neighbors, r);
-#endif // !CONFIG_ANS
dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
/*
av1_set_contexts(xd, pd,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index f4b5da6..80da661 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -620,7 +620,6 @@
}
#endif // CONFIG_SUPERTX
-#if !CONFIG_ANS
static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
const TOKENEXTRA *const stop,
aom_bit_depth_t bit_depth, const TX_SIZE tx) {
@@ -632,9 +631,11 @@
while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
+#if !CONFIG_ANS
const struct av1_token *const a = &av1_coef_encodings[t];
int v = a->value;
int n = a->len;
+#endif // !CONFIG_ANS
#if CONFIG_AOM_HIGHBITDEPTH
const av1_extra_bit *b;
if (bit_depth == AOM_BITS_12)
@@ -648,6 +649,19 @@
(void)bit_depth;
#endif // CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_ANS
+ /* skip one or two nodes */
+ if (!p->skip_eob_node) aom_write(w, t != EOB_TOKEN, p->context_tree[0]);
+
+ if (t != EOB_TOKEN) {
+ aom_write(w, t != ZERO_TOKEN, p->context_tree[1]);
+
+ if (t != ZERO_TOKEN) {
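+ /* Tokens ONE_TOKEN..CATEGORY6_TOKEN are coded as a single multi-symbol
+ * write of t - ONE_TOKEN against the per-band token CDF. */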
+ aom_write_symbol(w, t - ONE_TOKEN, *p->token_cdf,
+ CATEGORY6_TOKEN - ONE_TOKEN + 1);
+ }
+ }
+#else
/* skip one or two nodes */
if (p->skip_eob_node)
n -= p->skip_eob_node;
@@ -668,6 +682,7 @@
}
}
}
+#endif // CONFIG_ANS
if (b->base_val) {
const int e = p->extra, l = b->len;
@@ -705,83 +720,6 @@
*tp = p;
}
-#else
-// This function serializes the tokens in forward order using a buffered ans
-// coder.
-static void pack_mb_tokens(struct BufAnsCoder *ans, const TOKENEXTRA **tp,
- const TOKENEXTRA *const stop,
- aom_bit_depth_t bit_depth, const TX_SIZE tx) {
- const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX
- int count = 0;
- const int seg_eob = 16 << (tx << 1);
-#endif // CONFIG_VAR_TX
-
- while (p < stop && p->token != EOSB_TOKEN) {
- const int t = p->token;
-#if CONFIG_AOM_HIGHBITDEPTH
- const av1_extra_bit *b;
- if (bit_depth == AOM_BITS_12)
- b = &av1_extra_bits_high12[t];
- else if (bit_depth == AOM_BITS_10)
- b = &av1_extra_bits_high10[t];
- else
- b = &av1_extra_bits[t];
-#else
- const av1_extra_bit *const b = &av1_extra_bits[t];
- (void)bit_depth;
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
- /* skip one or two nodes */
- if (!p->skip_eob_node)
- buf_uabs_write(ans, t != EOB_TOKEN, p->context_tree[0]);
-
- if (t != EOB_TOKEN) {
- struct rans_sym s;
- const rans_lut *token_cdf = p->token_cdf;
- assert(token_cdf);
- s.cum_prob = (*token_cdf)[t - ZERO_TOKEN];
- s.prob = (*token_cdf)[t - ZERO_TOKEN + 1] - s.cum_prob;
- buf_rans_write(ans, &s);
-
- if (b->base_val) {
- const int e = p->extra, l = b->len;
- int skip_bits = (b->base_val == CAT6_MIN_VAL)
- ? TX_SIZES - 1 - txsize_sqr_up_map[tx]
- : 0;
-
- if (l) {
- const unsigned char *pb = b->prob;
- int v = e >> 1;
- int n = l; /* number of bits in v, assumed nonzero */
- int i = 0;
-
- do {
- const int bb = (v >> --n) & 1;
- if (skip_bits) {
- skip_bits--;
- assert(!bb);
- } else {
- buf_uabs_write(ans, bb, pb[i >> 1]);
- }
- i = b->tree[i + bb];
- } while (n);
- }
-
- buf_uabs_write(ans, e & 1, 128);
- }
- }
- ++p;
-
-#if CONFIG_VAR_TX
- ++count;
- if (t == EOB_TOKEN || count == seg_eob) break;
-#endif // CONFIG_VAR_TX
- }
-
- *tp = p;
-}
-#endif // !CONFIG_ANS
#if CONFIG_VAR_TX
static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
@@ -1932,7 +1870,38 @@
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
-#if DERING_REFINEMENT
+#if CONFIG_CLPF
+ if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
+ cm->clpf_size != CLPF_NOSIZE) {
+ const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
+ mi_col * MI_SIZE / MIN_FB_SIZE;
+ const int tr = tl + 1;
+ const int bl = tl + cm->clpf_stride;
+ const int br = tr + cm->clpf_stride;
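+ // tl, tr, bl and br index the up to four filter-block flags this
+ // 64x64 SB covers in the clpf_blocks grid.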
+
+ // Up to four bits per SB.
+ // When clpf_size indicates a size larger than the SB size
+ // (CLPF_128X128), one bit for every fourth SB will be transmitted
+ // regardless of skip blocks.
+ if (cm->clpf_blocks[tl] != CLPF_NOFLAG)
+ aom_write_literal(w, cm->clpf_blocks[tl], 1);
+
+ if (mi_col + MI_SIZE / 2 < cm->mi_cols &&
+ cm->clpf_blocks[tr] != CLPF_NOFLAG)
+ aom_write_literal(w, cm->clpf_blocks[tr], 1);
+
+ if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
+ cm->clpf_blocks[bl] != CLPF_NOFLAG)
+ aom_write_literal(w, cm->clpf_blocks[bl], 1);
+
+ if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
+ mi_col + MI_SIZE / 2 < cm->mi_cols &&
+ cm->clpf_blocks[br] != CLPF_NOFLAG)
+ aom_write_literal(w, cm->clpf_blocks[br], 1);
+ }
+#endif
+
+#if CONFIG_DERING
if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
!sb_all_skip(cm, mi_row, mi_col)) {
aom_write_literal(
@@ -2596,18 +2565,6 @@
aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
if (cm->clpf_strength_y) {
aom_wb_write_literal(wb, cm->clpf_size, 2);
- if (cm->clpf_size) {
- int i;
- // TODO(stemidts): The number of bits to transmit could be
- // implicitly deduced if transmitted after the filter block or
- // after the frame (when it's known whether the block is all
- // skip and implicitly unfiltered). And the bits do not have
- // 50% probability, so a more efficient coding is possible.
- aom_wb_write_literal(wb, cm->clpf_numblocks, av1_clpf_maxbits(cm));
- for (i = 0; i < cm->clpf_numblocks; i++) {
- aom_wb_write_literal(wb, cm->clpf_blocks ? cm->clpf_blocks[i] : 0, 1);
- }
- }
}
}
#endif
@@ -3199,11 +3156,10 @@
write_sync_code(wb);
write_bitdepth_colorspace_sampling(cm, wb);
write_frame_size(cm, wb);
- if (frame_is_intra_only(cm))
- aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+ aom_wb_write_bit(wb, cm->allow_screen_content_tools);
} else {
if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
-
+ if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
if (!cm->error_resilient_mode) {
if (cm->intra_only) {
aom_wb_write_bit(wb,
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 1d498f1..4e652b6 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -127,14 +127,15 @@
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
- unsigned int fb_size_log2, uint8_t *res) {
+ unsigned int fb_size_log2, int8_t *res) {
int m, n, sum0 = 0, sum1 = 0;
for (m = 0; m < h; m++) {
for (n = 0; n < w; n++) {
int xpos = (l << fb_size_log2) + n * block_size;
int ypos = (k << fb_size_log2) + m * block_size;
- if (!cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
+ if (fb_size_log2 == MAX_FB_SIZE_LOG2 ||
+ !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
@@ -167,6 +168,8 @@
// (Only for luma:)
// res[1][0] : (bit count, fb size = 128)
// res[1][1-3] : strength=1,2,4, fb size = 128
+// res[1][4] : unfiltered, including skip
+// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128
// res[2][0] : (bit count, fb size = 64)
// res[2][1-3] : strength=1,2,4, fb size = 64
// res[3][0] : (bit count, fb size = 32)
@@ -174,9 +177,9 @@
static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
unsigned int block_size, unsigned int fb_size_log2, int w,
- int h, int64_t res[4][4], int plane) {
+ int h, int64_t res[4][8], int plane) {
int c, m, n, filtered = 0;
- int sum[4];
+ int sum[8];
const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
int bslog = get_msb(block_size);
@@ -193,12 +196,12 @@
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
- sum[0] = sum[1] = sum[2] = sum[3] = 0;
+ sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
if (plane == AOM_PLANE_Y &&
fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
- fb_size_log2--;
+ filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2;
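+ // The assignment above sets filtered = 1 when starting at the largest
+ // (128x128) fb size, where a signal bit is always counted, and
+ // post-decrements fb_size_log2 either way.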
w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
@@ -210,8 +213,8 @@
oldfiltered = res[i][0];
res[i][0] = 0;
- filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
- res, plane);
+ filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+ res, plane);
if (1 << (fb_size_log2 - bslog) < w)
filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
fb_size_log2, w2, h1, res, plane);
@@ -223,10 +226,18 @@
cm, block_size, fb_size_log2, w2, h2, res, plane);
}
+ // Correct sums for unfiltered blocks
res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]);
res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]);
+ if (i == 1) {
+ res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]);
+ res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]);
+ res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]);
+ }
+
res[i][0] = oldfiltered + filtered; // Number of signal bits
+
return filtered;
}
@@ -234,27 +245,28 @@
for (n = 0; n < w; n++) {
int xpos = x + n * block_size;
int ypos = y + m * block_size;
- if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
- (xpos << subx) / MI_SIZE]
- ->mbmi.skip) {
+ int skip = // Filtered skip blocks stored only for fb_size == 128
+ 4 *
+ !!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+ (xpos << subx) / MI_SIZE]
+ ->mbmi.skip;
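+ // skip is 0 for coded blocks and 4 for skip blocks, so skip-block
+ // results accumulate into sum[4..7] (stored only for fb_size == 128).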
#if CONFIG_AOM_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- aom_clpf_detect_multi_hbd(
- CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
- rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
- cm->bit_depth - 8, block_size);
- } else {
- aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
- xpos, ypos, rec_width, rec_height, sum,
- block_size);
- }
-#else
+ if (cm->use_highbitdepth) {
+ aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
+ CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
+ org_stride, xpos, ypos, rec_width, rec_height,
+ sum + skip, cm->bit_depth - 8, block_size);
+ } else {
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
- xpos, ypos, rec_width, rec_height, sum,
+ xpos, ypos, rec_width, rec_height, sum + skip,
block_size);
-#endif
- filtered = 1;
}
+#else
+ aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+ xpos, ypos, rec_width, rec_height, sum + skip,
+ block_size);
+#endif
+ filtered |= !skip;
}
}
@@ -263,6 +275,12 @@
res[c][1] += sum[1];
res[c][2] += sum[2];
res[c][3] += sum[3];
+ if (c != 1) continue;
+ // Only needed when fb_size == 128
+ res[c][4] += sum[4];
+ res[c][5] += sum[5];
+ res[c][6] += sum[6];
+ res[c][7] += sum[7];
}
return filtered;
}
@@ -271,7 +289,7 @@
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int *best_strength, int *best_bs, int plane) {
int c, j, k, l;
- int64_t best, sums[4][4];
+ int64_t best, sums[4][8];
int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
const int bs = MI_SIZE;
@@ -303,8 +321,14 @@
}
}
- if (plane != AOM_PLANE_Y) // Slightly favour unfiltered chroma
+ // For fb_size == 128, skip blocks are included in the result.
+ if (plane == AOM_PLANE_Y) {
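+ // sums[1][4] is the unfiltered cost accumulated over skip blocks and
+ // sums[1][5..7] the strength 1/2/4 costs, so each delta below folds
+ // the effect of filtering skip blocks into the 128x128 totals.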
+ sums[1][1] += sums[1][5] - sums[1][4];
+ sums[1][2] += sums[1][6] - sums[1][4];
+ sums[1][3] += sums[1][7] - sums[1][4];
+ } else { // Slightly favour unfiltered chroma
sums[0][0] -= sums[0][0] >> 7;
+ }
for (j = 0; j < 4; j++) {
static const double lambda_square[] = {
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index bb85fbc..586eed0 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -17,7 +17,7 @@
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
- unsigned int fb_size_log2, uint8_t *res);
+ unsigned int fb_size_log2, int8_t *res);
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
diff --git a/av1/encoder/cost.c b/av1/encoder/cost.c
index 9a2ac8e..e3151a5 100644
--- a/av1/encoder/cost.c
+++ b/av1/encoder/cost.c
@@ -11,9 +11,6 @@
#include <assert.h>
#include "av1/encoder/cost.h"
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#endif // CONFIG_ANS
#include "av1/common/entropy.h"
/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT))
@@ -41,91 +38,6 @@
26, 23, 20, 18, 15, 12, 9, 6, 3
};
-#if CONFIG_ANS
-// round(-log2(i/1024.) * (1 << AV1_PROB_COST_SHIFT))
-static const uint16_t av1_prob_cost10[1024] = {
- 5120, 5120, 4608, 4308, 4096, 3931, 3796, 3683, 3584, 3497, 3419, 3349, 3284,
- 3225, 3171, 3120, 3072, 3027, 2985, 2945, 2907, 2871, 2837, 2804, 2772, 2742,
- 2713, 2685, 2659, 2633, 2608, 2583, 2560, 2537, 2515, 2494, 2473, 2453, 2433,
- 2414, 2395, 2377, 2359, 2342, 2325, 2308, 2292, 2276, 2260, 2245, 2230, 2216,
- 2201, 2187, 2173, 2160, 2147, 2134, 2121, 2108, 2096, 2083, 2071, 2060, 2048,
- 2037, 2025, 2014, 2003, 1992, 1982, 1971, 1961, 1951, 1941, 1931, 1921, 1911,
- 1902, 1892, 1883, 1874, 1865, 1856, 1847, 1838, 1830, 1821, 1813, 1804, 1796,
- 1788, 1780, 1772, 1764, 1756, 1748, 1741, 1733, 1726, 1718, 1711, 1704, 1697,
- 1689, 1682, 1675, 1668, 1661, 1655, 1648, 1641, 1635, 1628, 1622, 1615, 1609,
- 1602, 1596, 1590, 1584, 1578, 1571, 1565, 1559, 1554, 1548, 1542, 1536, 1530,
- 1525, 1519, 1513, 1508, 1502, 1497, 1491, 1486, 1480, 1475, 1470, 1465, 1459,
- 1454, 1449, 1444, 1439, 1434, 1429, 1424, 1419, 1414, 1409, 1404, 1399, 1395,
- 1390, 1385, 1380, 1376, 1371, 1367, 1362, 1357, 1353, 1348, 1344, 1340, 1335,
- 1331, 1326, 1322, 1318, 1313, 1309, 1305, 1301, 1297, 1292, 1288, 1284, 1280,
- 1276, 1272, 1268, 1264, 1260, 1256, 1252, 1248, 1244, 1240, 1236, 1233, 1229,
- 1225, 1221, 1218, 1214, 1210, 1206, 1203, 1199, 1195, 1192, 1188, 1185, 1181,
- 1177, 1174, 1170, 1167, 1163, 1160, 1156, 1153, 1149, 1146, 1143, 1139, 1136,
- 1133, 1129, 1126, 1123, 1119, 1116, 1113, 1110, 1106, 1103, 1100, 1097, 1094,
- 1090, 1087, 1084, 1081, 1078, 1075, 1072, 1069, 1066, 1062, 1059, 1056, 1053,
- 1050, 1047, 1044, 1042, 1039, 1036, 1033, 1030, 1027, 1024, 1021, 1018, 1015,
- 1013, 1010, 1007, 1004, 1001, 998, 996, 993, 990, 987, 985, 982, 979,
- 977, 974, 971, 968, 966, 963, 960, 958, 955, 953, 950, 947, 945,
- 942, 940, 937, 934, 932, 929, 927, 924, 922, 919, 917, 914, 912,
- 909, 907, 904, 902, 899, 897, 895, 892, 890, 887, 885, 883, 880,
- 878, 876, 873, 871, 868, 866, 864, 861, 859, 857, 855, 852, 850,
- 848, 845, 843, 841, 839, 836, 834, 832, 830, 828, 825, 823, 821,
- 819, 817, 814, 812, 810, 808, 806, 804, 801, 799, 797, 795, 793,
- 791, 789, 787, 785, 783, 780, 778, 776, 774, 772, 770, 768, 766,
- 764, 762, 760, 758, 756, 754, 752, 750, 748, 746, 744, 742, 740,
- 738, 736, 734, 732, 730, 728, 726, 724, 723, 721, 719, 717, 715,
- 713, 711, 709, 707, 706, 704, 702, 700, 698, 696, 694, 693, 691,
- 689, 687, 685, 683, 682, 680, 678, 676, 674, 673, 671, 669, 667,
- 665, 664, 662, 660, 658, 657, 655, 653, 651, 650, 648, 646, 644,
- 643, 641, 639, 637, 636, 634, 632, 631, 629, 627, 626, 624, 622,
- 621, 619, 617, 616, 614, 612, 611, 609, 607, 606, 604, 602, 601,
- 599, 598, 596, 594, 593, 591, 590, 588, 586, 585, 583, 582, 580,
- 578, 577, 575, 574, 572, 571, 569, 567, 566, 564, 563, 561, 560,
- 558, 557, 555, 554, 552, 550, 549, 547, 546, 544, 543, 541, 540,
- 538, 537, 535, 534, 532, 531, 530, 528, 527, 525, 524, 522, 521,
- 519, 518, 516, 515, 513, 512, 511, 509, 508, 506, 505, 503, 502,
- 501, 499, 498, 496, 495, 493, 492, 491, 489, 488, 486, 485, 484,
- 482, 481, 480, 478, 477, 475, 474, 473, 471, 470, 469, 467, 466,
- 465, 463, 462, 460, 459, 458, 456, 455, 454, 452, 451, 450, 448,
- 447, 446, 444, 443, 442, 441, 439, 438, 437, 435, 434, 433, 431,
- 430, 429, 428, 426, 425, 424, 422, 421, 420, 419, 417, 416, 415,
- 414, 412, 411, 410, 409, 407, 406, 405, 404, 402, 401, 400, 399,
- 397, 396, 395, 394, 392, 391, 390, 389, 387, 386, 385, 384, 383,
- 381, 380, 379, 378, 377, 375, 374, 373, 372, 371, 369, 368, 367,
- 366, 365, 364, 362, 361, 360, 359, 358, 356, 355, 354, 353, 352,
- 351, 349, 348, 347, 346, 345, 344, 343, 341, 340, 339, 338, 337,
- 336, 335, 333, 332, 331, 330, 329, 328, 327, 326, 324, 323, 322,
- 321, 320, 319, 318, 317, 316, 314, 313, 312, 311, 310, 309, 308,
- 307, 306, 305, 303, 302, 301, 300, 299, 298, 297, 296, 295, 294,
- 293, 292, 291, 289, 288, 287, 286, 285, 284, 283, 282, 281, 280,
- 279, 278, 277, 276, 275, 274, 273, 272, 271, 269, 268, 267, 266,
- 265, 264, 263, 262, 261, 260, 259, 258, 257, 256, 255, 254, 253,
- 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240,
- 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 228, 227,
- 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, 215, 214,
- 213, 212, 212, 211, 210, 209, 208, 207, 206, 205, 204, 203, 202,
- 201, 200, 199, 198, 197, 196, 195, 194, 194, 193, 192, 191, 190,
- 189, 188, 187, 186, 185, 184, 183, 182, 181, 181, 180, 179, 178,
- 177, 176, 175, 174, 173, 172, 171, 170, 170, 169, 168, 167, 166,
- 165, 164, 163, 162, 161, 161, 160, 159, 158, 157, 156, 155, 154,
- 153, 152, 152, 151, 150, 149, 148, 147, 146, 145, 145, 144, 143,
- 142, 141, 140, 139, 138, 138, 137, 136, 135, 134, 133, 132, 132,
- 131, 130, 129, 128, 127, 126, 125, 125, 124, 123, 122, 121, 120,
- 120, 119, 118, 117, 116, 115, 114, 114, 113, 112, 111, 110, 109,
- 109, 108, 107, 106, 105, 104, 104, 103, 102, 101, 100, 99, 99,
- 98, 97, 96, 95, 95, 94, 93, 92, 91, 90, 90, 89, 88,
- 87, 86, 86, 85, 84, 83, 82, 82, 81, 80, 79, 78, 78,
- 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67,
- 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57,
- 56, 55, 55, 54, 53, 52, 52, 51, 50, 49, 48, 48, 47,
- 46, 45, 45, 44, 43, 42, 42, 41, 40, 39, 38, 38, 37,
- 36, 35, 35, 34, 33, 32, 32, 31, 30, 29, 29, 28, 27,
- 26, 26, 25, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18,
- 17, 16, 15, 15, 14, 13, 12, 12, 11, 10, 9, 9, 8,
- 7, 7, 6, 5, 4, 4, 3, 2, 1, 1
-};
-#endif // CONFIG_ANS
-
static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i,
int c) {
const aom_prob prob = probs[i / 2];
@@ -143,20 +55,6 @@
}
}
-#if CONFIG_ANS
-void av1_cost_tokens_ans(int *costs, const aom_prob *tree_probs,
- const rans_lut token_cdf, int skip_eob) {
- int c_tree = 0; // Cost of the "tree" nodes EOB and ZERO.
- int i;
- costs[EOB_TOKEN] = av1_cost_bit(tree_probs[0], 0);
- if (!skip_eob) c_tree = av1_cost_bit(tree_probs[0], 1);
- for (i = ZERO_TOKEN; i <= CATEGORY6_TOKEN; ++i) {
- const int p = token_cdf[i + 1] - token_cdf[i];
- costs[i] = c_tree + av1_prob_cost10[p];
- }
-}
-#endif // CONFIG_ANS
-
void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) {
cost(costs, tree, probs, 0, 0);
}
diff --git a/av1/encoder/cost.h b/av1/encoder/cost.h
index 448b905..379200e 100644
--- a/av1/encoder/cost.h
+++ b/av1/encoder/cost.h
@@ -13,9 +13,6 @@
#include "aom_dsp/prob.h"
#include "aom/aom_integer.h"
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#endif // CONFIG_ANS
#ifdef __cplusplus
extern "C" {
@@ -58,11 +55,6 @@
void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree);
void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree);
-#if CONFIG_ANS
-void av1_cost_tokens_ans(int *costs, const aom_prob *tree_probs,
- const rans_lut token_cdf, int skip_eob);
-#endif
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 163f4c0..2d9a892 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -61,8 +61,9 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
- int output_enabled, int mi_row, int mi_col,
- BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int *rate);
#if CONFIG_SUPERTX
static int check_intra_b(PICK_MODE_CONTEXT *ctx);
@@ -80,13 +81,13 @@
static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int mi_row,
int mi_col, int mi_row_ori, int mi_col_ori,
- int output_enabled, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
int dst_stride[3], PC_TREE *pc_tree);
static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int mi_row,
int mi_col, BLOCK_SIZE bsize,
- int output_enabled, PC_TREE *pc_tree);
+ RUN_TYPE dry_run, PC_TREE *pc_tree);
static void rd_supertx_sb(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
@@ -1012,7 +1013,7 @@
static void update_state(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
int mi_row, int mi_col, BLOCK_SIZE bsize,
- int output_enabled) {
+ RUN_TYPE dry_run) {
int i, x_idx, y;
AV1_COMMON *const cm = &cpi->common;
RD_COUNTS *const rdc = &td->rd_counts;
@@ -1126,7 +1127,7 @@
sizeof(uint8_t) * ctx->num_4x4_blk);
#endif
- if (!output_enabled) return;
+ if (dry_run) return;
#if CONFIG_INTERNAL_STATS
if (frame_is_intra_only(cm)) {
@@ -1195,7 +1196,7 @@
#if CONFIG_SUPERTX
static void update_state_supertx(AV1_COMP *cpi, ThreadData *td,
PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int output_enabled) {
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
int y, x_idx;
#if CONFIG_VAR_TX || CONFIG_REF_MV
int i;
@@ -1303,7 +1304,7 @@
// Turn motion variation off for supertx
mbmi->motion_mode = SIMPLE_TRANSLATION;
- if (!output_enabled) return;
+ if (dry_run) return;
if (!frame_is_intra_only(cm)) {
av1_update_mv_count(td);
@@ -1341,7 +1342,7 @@
static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int mi_row,
int mi_col, BLOCK_SIZE bsize,
- int output_enabled, PC_TREE *pc_tree) {
+ RUN_TYPE dry_run, PC_TREE *pc_tree) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -1365,27 +1366,27 @@
case PARTITION_NONE:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
- output_enabled);
+ dry_run);
break;
case PARTITION_VERT:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
- subsize, output_enabled);
+ subsize, dry_run);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
- mi_col + hbs, subsize, output_enabled);
+ mi_col + hbs, subsize, dry_run);
}
pmc = &pc_tree->vertical_supertx;
break;
case PARTITION_HORZ:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
- subsize, output_enabled);
+ subsize, dry_run);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
- mi_col, subsize, output_enabled);
+ mi_col, subsize, dry_run);
}
pmc = &pc_tree->horizontal_supertx;
break;
@@ -1393,20 +1394,20 @@
if (bsize == BLOCK_8X8) {
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
- subsize, output_enabled);
+ subsize, dry_run);
} else {
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
- update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize,
- output_enabled, pc_tree->split[0]);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
+ pc_tree->split[0]);
set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
- output_enabled, pc_tree->split[1]);
+ dry_run, pc_tree->split[1]);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
- output_enabled, pc_tree->split[2]);
+ dry_run, pc_tree->split[2]);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
- subsize, output_enabled, pc_tree->split[3]);
+ subsize, dry_run, pc_tree->split[3]);
}
pmc = &pc_tree->split_supertx;
break;
@@ -1414,49 +1415,49 @@
case PARTITION_HORZ_A:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
- bsize2, output_enabled);
+ bsize2, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
- mi_col + hbs, bsize2, output_enabled);
+ mi_col + hbs, bsize2, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
- mi_col, subsize, output_enabled);
+ mi_col, subsize, dry_run);
pmc = &pc_tree->horizontala_supertx;
break;
case PARTITION_HORZ_B:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
- subsize, output_enabled);
+ subsize, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
- mi_col, bsize2, output_enabled);
+ mi_col, bsize2, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
- mi_col + hbs, bsize2, output_enabled);
+ mi_col + hbs, bsize2, dry_run);
pmc = &pc_tree->horizontalb_supertx;
break;
case PARTITION_VERT_A:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
- bsize2, output_enabled);
+ bsize2, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
- mi_col, bsize2, output_enabled);
+ mi_col, bsize2, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
- mi_col + hbs, subsize, output_enabled);
+ mi_col + hbs, subsize, dry_run);
pmc = &pc_tree->verticala_supertx;
break;
case PARTITION_VERT_B:
set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
- subsize, output_enabled);
+ subsize, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
- mi_col + hbs, bsize2, output_enabled);
+ mi_col + hbs, bsize2, dry_run);
set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
- mi_col + hbs, bsize2, output_enabled);
+ mi_col + hbs, bsize2, dry_run);
pmc = &pc_tree->verticalb_supertx;
break;
#endif // CONFIG_EXT_PARTITION_TYPES
@@ -2096,21 +2097,21 @@
}
static void encode_b(AV1_COMP *cpi, const TileInfo *const tile, ThreadData *td,
- TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE bsize,
+ TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize,
#if CONFIG_EXT_PARTITION_TYPES
PARTITION_TYPE partition,
#endif
- PICK_MODE_CONTEXT *ctx) {
+ PICK_MODE_CONTEXT *ctx, int *rate) {
MACROBLOCK *const x = &td->mb;
set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
#if CONFIG_EXT_PARTITION_TYPES
x->e_mbd.mi[0]->mbmi.partition = partition;
#endif
- update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
- encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+ encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate);
- if (output_enabled) {
+ if (!dry_run) {
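+ // Stats are only accumulated on a real (non-dry) run.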
#if CONFIG_SUPERTX
update_stats(&cpi->common, td, 0);
#else
@@ -2120,8 +2121,8 @@
}
static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile,
- TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -2138,7 +2139,7 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- if (output_enabled) td->counts->partition[ctx][partition]++;
+ if (!dry_run) td->counts->partition[ctx][partition]++;
#if CONFIG_SUPERTX
if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
@@ -2154,33 +2155,34 @@
int dst_stride[3];
set_skip_context(xd, mi_row, mi_col);
set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
- update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize,
- output_enabled, pc_tree);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
+ pc_tree);
av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
for (i = 0; i < MAX_MB_PLANE; i++) {
dst_buf[i] = xd->plane[i].dst.buf;
dst_stride[i] = xd->plane[i].dst.stride;
}
- predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
- output_enabled, bsize, bsize, dst_buf, dst_stride,
- pc_tree);
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
+ bsize, bsize, dst_buf, dst_stride, pc_tree);
set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
if (!x->skip) {
+ int this_rate = 0;
x->skip_optimize = 0;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
av1_encode_sb_supertx(x, bsize);
- av1_tokenize_sb_supertx(cpi, td, tp, !output_enabled, bsize);
+ av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, &this_rate);
+ if (rate) *rate += this_rate;
} else {
xd->mi[0]->mbmi.skip = 1;
- if (output_enabled) td->counts->skip[av1_get_skip_context(xd)][1]++;
+ if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
reset_skip_context(xd, bsize);
}
- if (output_enabled) {
+ if (!dry_run) {
for (y_idx = 0; y_idx < mi_height; y_idx++)
for (x_idx = 0; x_idx < mi_width; x_idx++) {
if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
@@ -2221,7 +2223,7 @@
#endif // CONFIG_VAR_TX
return;
} else {
- if (output_enabled) {
+ if (!dry_run) {
td->counts->supertx[partition_supertx_context_lookup[partition]]
[supertx_size][0]++;
}
@@ -2231,93 +2233,91 @@
switch (partition) {
case PARTITION_NONE:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
#if CONFIG_EXT_PARTITION_TYPES
partition,
#endif
- &pc_tree->none);
+ &pc_tree->none, rate);
break;
case PARTITION_VERT:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
#if CONFIG_EXT_PARTITION_TYPES
partition,
#endif
- &pc_tree->vertical[0]);
+ &pc_tree->vertical[0], rate);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
- subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
#if CONFIG_EXT_PARTITION_TYPES
partition,
#endif
- &pc_tree->vertical[1]);
+ &pc_tree->vertical[1], rate);
}
break;
case PARTITION_HORZ:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
#if CONFIG_EXT_PARTITION_TYPES
partition,
#endif
- &pc_tree->horizontal[0]);
+ &pc_tree->horizontal[0], rate);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
- subsize,
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
#if CONFIG_EXT_PARTITION_TYPES
partition,
#endif
- &pc_tree->horizontal[1]);
+ &pc_tree->horizontal[1], rate);
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
#if CONFIG_EXT_PARTITION_TYPES
partition,
#endif
- pc_tree->leaf_split[0]);
+ pc_tree->leaf_split[0], rate);
} else {
- encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
- encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
- encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
- encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
}
break;
#if CONFIG_EXT_PARTITION_TYPES
case PARTITION_HORZ_A:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
- partition, &pc_tree->horizontala[0]);
- encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
- partition, &pc_tree->horizontala[1]);
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, subsize,
- partition, &pc_tree->horizontala[2]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontala[2], rate);
break;
case PARTITION_HORZ_B:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- partition, &pc_tree->horizontalb[0]);
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
- partition, &pc_tree->horizontalb[1]);
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- bsize2, partition, &pc_tree->horizontalb[2]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[2], rate);
break;
case PARTITION_VERT_A:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
- partition, &pc_tree->verticala[0]);
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
- partition, &pc_tree->verticala[1]);
- encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, subsize,
- partition, &pc_tree->verticala[2]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->verticala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->verticala[2], rate);
break;
case PARTITION_VERT_B:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- partition, &pc_tree->verticalb[0]);
- encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
- partition, &pc_tree->verticalb[1]);
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- bsize2, partition, &pc_tree->verticalb[2]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[2], rate);
break;
#endif // CONFIG_EXT_PARTITION_TYPES
default: assert(0 && "Invalid partition type."); break;
@@ -2532,8 +2532,9 @@
#endif
PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
av1_rd_cost_init(&tmp_rdc);
- update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx, NULL);
rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
#if CONFIG_SUPERTX
&rt_nocoef,
@@ -2574,8 +2575,9 @@
#endif
PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
av1_rd_cost_init(&tmp_rdc);
- update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx, NULL);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
#if CONFIG_SUPERTX
&rt_nocoef,
@@ -2728,8 +2730,8 @@
#endif
if (i != 3)
- encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
- split_subsize, pc_tree->split[i]);
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
#if CONFIG_SUPERTX
@@ -2772,9 +2774,17 @@
assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
if (do_recon) {
- int output_enabled = (bsize == cm->sb_size);
- encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
- pc_tree);
+ if (bsize == cm->sb_size) {
+ // NOTE: To get an estimate of the rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
}
*rate = chosen_rdc.rate;
@@ -3123,8 +3133,9 @@
if (sum_rdc.rdcost < best_rdc->rdcost) {
#endif
PICK_MODE_CONTEXT *ctx = &ctxs[0];
- update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 0);
- encode_superblock(cpi, td, tp, 0, mi_row0, mi_col0, subsize0, ctx);
+ update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
+ ctx, NULL);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
@@ -3163,8 +3174,9 @@
if (sum_rdc.rdcost < best_rdc->rdcost) {
#endif
PICK_MODE_CONTEXT *ctx = &ctxs[1];
- update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 0);
- encode_superblock(cpi, td, tp, 0, mi_row1, mi_col1, subsize1, ctx);
+ update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
+ ctx, NULL);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
@@ -3775,8 +3787,9 @@
#endif // CONFIG_SUPERTX
mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) {
PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
- update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx, NULL);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
@@ -3911,9 +3924,9 @@
if (sum_rdc.rdcost < best_rdc.rdcost &&
#endif // CONFIG_SUPERTX
mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) {
- update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
- &pc_tree->vertical[0]);
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0], NULL);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
@@ -4086,9 +4099,13 @@
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
pc_tree->index != 3) {
- int output_enabled = (bsize == cm->sb_size);
- encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
- pc_tree);
+ if (bsize == cm->sb_size) {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
}
if (bsize == cm->sb_size) {
@@ -4226,11 +4243,7 @@
cpi->td.rd_counts.coef_counts);
av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
cm->counts.eob_branch);
- av1_fill_token_costs(x->token_costs,
-#if CONFIG_ANS
- cm->fc->coef_cdfs,
-#endif // CONFIG_ANS
- cm->fc->coef_probs);
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
}
}
#endif // CONFIG_ENTROPY
@@ -4987,8 +5000,9 @@
#endif
static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
- int output_enabled, int mi_row, int mi_col,
- BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int *rate) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -5010,12 +5024,12 @@
mbmi->skip = 1;
for (plane = 0; plane < MAX_MB_PLANE; ++plane)
av1_encode_intra_block_plane(x, AOMMAX(bsize, BLOCK_8X8), plane, 1);
- if (output_enabled)
+ if (!dry_run)
sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
frame_is_intra_only(cm));
#if CONFIG_EXT_INTRA
- if (output_enabled && bsize >= BLOCK_8X8) {
+ if (!dry_run && bsize >= BLOCK_8X8) {
FRAME_COUNTS *counts = td->counts;
if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0)
++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
@@ -5033,18 +5047,18 @@
}
#endif // CONFIG_EXT_INTRA
- if (bsize >= BLOCK_8X8 && output_enabled) {
+ if (bsize >= BLOCK_8X8 && !dry_run) {
for (plane = 0; plane <= 1; ++plane) {
if (mbmi->palette_mode_info.palette_size[plane] > 0) {
mbmi->palette_mode_info.palette_first_color_idx[plane] =
xd->plane[plane].color_index_map[0];
// TODO(huisu): this increases the use of token buffer. Needs stretch
// test to verify.
- av1_tokenize_palette_sb(td, bsize, plane, t);
+ av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
}
}
}
- av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+ av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
} else {
int ref;
const int is_compound = has_second_ref(mbmi);
@@ -5116,17 +5130,17 @@
#if CONFIG_VAR_TX
#if CONFIG_EXT_TX && CONFIG_RECT_TX
if (is_rect_tx(mbmi->tx_size))
- av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+ av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
else
#endif
- av1_tokenize_sb_vartx(cpi, td, t, !output_enabled, mi_row, mi_col,
- AOMMAX(bsize, BLOCK_8X8));
+ av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col,
+ AOMMAX(bsize, BLOCK_8X8), rate);
#else
- av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+ av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
#endif
}
- if (output_enabled) {
+ if (!dry_run) {
if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
!(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
const int is_inter = is_inter_block(mbmi);
@@ -5209,8 +5223,7 @@
#if CONFIG_VAR_TX
if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
- if (!output_enabled)
- tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
if (is_rect_tx(mbmi->tx_size)) {
set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
@@ -5406,7 +5419,7 @@
int mi_col_pred, int mi_row_top, int mi_col_top,
uint8_t *dst_buf[3], int dst_stride[3],
BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
- int output_enabled, int b_sub8x8, int bextend) {
+ RUN_TYPE dry_run, int b_sub8x8, int bextend) {
// Used in supertx
// (mi_row_ori, mi_col_ori): location for mv
// (mi_row_pred, mi_col_pred, bsize_pred): region to predict
@@ -5450,13 +5463,13 @@
#endif // CONFIG_EXT_INTER
mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
- if (output_enabled && !bextend) update_stats(&cpi->common, td, 1);
+ if (!dry_run && !bextend) update_stats(&cpi->common, td, 1);
}
static void extend_dir(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int block, BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize, int mi_row, int mi_col,
- int mi_row_top, int mi_col_top, int output_enabled,
+ int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
uint8_t *dst_buf[3], int dst_stride[3], int dir) {
// dir: 0-lower, 1-upper, 2-left, 3-right
// 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
@@ -5480,7 +5493,7 @@
predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
- top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+ top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
if (mi_width > unit) {
int i;
@@ -5488,8 +5501,8 @@
mi_col_pred += unit;
predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
mi_col_pred, mi_row_top, mi_col_top, dst_buf,
- dst_stride, top_bsize, extend_bsize, output_enabled,
- b_sub8x8, 1);
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
}
}
} else if (dir == 2 || dir == 3) { // left and right
@@ -5502,7 +5515,7 @@
predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
- top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+ top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
if (mi_height > unit) {
int i;
@@ -5510,8 +5523,8 @@
mi_row_pred += unit;
predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
mi_col_pred, mi_row_top, mi_col_top, dst_buf,
- dst_stride, top_bsize, extend_bsize, output_enabled,
- b_sub8x8, 1);
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
}
}
} else {
@@ -5521,32 +5534,32 @@
predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
- top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+ top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
}
}
static void extend_all(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int block, BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize, int mi_row, int mi_col,
- int mi_row_top, int mi_col_top, int output_enabled,
+ int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
uint8_t *dst_buf[3], int dst_stride[3]) {
assert(block >= 0 && block < 4);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 0);
+ mi_col_top, dry_run, dst_buf, dst_stride, 0);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 1);
+ mi_col_top, dry_run, dst_buf, dst_stride, 1);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 2);
+ mi_col_top, dry_run, dst_buf, dst_stride, 2);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 3);
+ mi_col_top, dry_run, dst_buf, dst_stride, 3);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 4);
+ mi_col_top, dry_run, dst_buf, dst_stride, 4);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 5);
+ mi_col_top, dry_run, dst_buf, dst_stride, 5);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 6);
+ mi_col_top, dry_run, dst_buf, dst_stride, 6);
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride, 7);
+ mi_col_top, dry_run, dst_buf, dst_stride, 7);
}
// This function generates prediction for multiple blocks, between which
@@ -5560,7 +5573,7 @@
static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td,
const TileInfo *const tile, int mi_row,
int mi_col, int mi_row_top, int mi_col_top,
- int output_enabled, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
int dst_stride[3], PC_TREE *pc_tree) {
AV1_COMMON *const cm = &cpi->common;
@@ -5615,8 +5628,7 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
- if (output_enabled && bsize < top_bsize)
- cm->counts.partition[ctx][partition]++;
+ if (!dry_run && bsize < top_bsize) cm->counts.partition[ctx][partition]++;
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
@@ -5628,29 +5640,27 @@
assert(bsize < top_bsize);
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- bsize, output_enabled, 0, 0);
+ bsize, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, dst_buf, dst_stride);
+ mi_col_top, dry_run, dst_buf, dst_stride);
break;
case PARTITION_HORZ:
if (bsize == BLOCK_8X8) {
// First half
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- BLOCK_8X8, output_enabled, 1, 0);
+ BLOCK_8X8, dry_run, 1, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
// Second half
predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf1, dst_stride1,
- top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf1,
- dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
// Smooth
xd->plane[0].dst.buf = dst_buf[0];
@@ -5663,29 +5673,26 @@
// First half
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- subsize, output_enabled, 0, 0);
+ subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride, 0);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
if (mi_row + hbs < cm->mi_rows) {
// Second half
predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
mi_col, mi_row_top, mi_col_top, dst_buf1,
- dst_stride1, top_bsize, subsize, output_enabled, 0,
- 0);
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
- mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
dst_stride1);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
- mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
dst_stride1, 1);
// Smooth
@@ -5705,20 +5712,18 @@
// First half
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- BLOCK_8X8, output_enabled, 1, 0);
+ BLOCK_8X8, dry_run, 1, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
// Second half
predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf1, dst_stride1,
- top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf1,
- dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
// Smooth
xd->plane[0].dst.buf = dst_buf[0];
@@ -5731,29 +5736,26 @@
// bsize: not important, not useful
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- subsize, output_enabled, 0, 0);
+ subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride, 3);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
if (mi_col + hbs < cm->mi_cols) {
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
- dst_stride1, top_bsize, subsize, output_enabled, 0,
- 0);
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
- mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
- dst_buf1, dst_stride1);
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
- mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
- dst_buf1, dst_stride1, 2);
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 2);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
@@ -5770,46 +5772,42 @@
if (bsize == BLOCK_8X8) {
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- BLOCK_8X8, output_enabled, 1, 0);
+ BLOCK_8X8, dry_run, 1, 0);
predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf1, dst_stride1,
- top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf2, dst_stride2,
- top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf3, dst_stride3,
- top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
if (bsize < top_bsize) {
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf,
- dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf1,
- dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf2,
- dst_stride2);
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf3,
- dst_stride3);
+ mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
}
} else {
predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
- mi_col_top, output_enabled, subsize, top_bsize,
- dst_buf, dst_stride, pc_tree->split[0]);
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf,
+ dst_stride, pc_tree->split[0]);
if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
- mi_col_top, output_enabled, subsize, top_bsize,
- dst_buf1, dst_stride1, pc_tree->split[1]);
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
+ dst_stride1, pc_tree->split[1]);
if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
- mi_col_top, output_enabled, subsize, top_bsize,
- dst_buf2, dst_stride2, pc_tree->split[2]);
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
+ dst_stride2, pc_tree->split[2]);
if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
- mi_row_top, mi_col_top, output_enabled, subsize,
+ mi_row_top, mi_col_top, dry_run, subsize,
top_bsize, dst_buf3, dst_stride3,
pc_tree->split[3]);
}
@@ -5843,27 +5841,25 @@
case PARTITION_HORZ_A:
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- bsize2, output_enabled, 0, 0);
+ bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
- dst_stride1, top_bsize, bsize2, output_enabled, 0, 0);
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
- mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
- top_bsize, subsize, output_enabled, 0, 0);
+ top_bsize, subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf2,
- dst_stride2);
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf2,
- dst_stride2, 1);
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
@@ -5885,27 +5881,25 @@
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- bsize2, output_enabled, 0, 0);
+ bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
- top_bsize, bsize2, output_enabled, 0, 0);
+ top_bsize, bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
- dst_stride2, top_bsize, subsize, output_enabled, 0, 0);
+ dst_stride2, top_bsize, subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
- mi_row_top, mi_col_top, output_enabled, dst_buf2,
- dst_stride2);
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
- mi_row_top, mi_col_top, output_enabled, dst_buf2,
- dst_stride2, 2);
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
@@ -5926,27 +5920,25 @@
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- subsize, output_enabled, 0, 0);
+ subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride,
- 0);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
- top_bsize, bsize2, output_enabled, 0, 0);
+ top_bsize, bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
- dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled,
- 0, 0);
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
- mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
dst_stride2);
for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -5970,27 +5962,25 @@
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
- subsize, output_enabled, 0, 0);
+ subsize, dry_run, 0, 0);
if (bsize < top_bsize)
extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
else
extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
- mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride,
- 3);
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
- dst_stride1, top_bsize, bsize2, output_enabled, 0, 0);
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
- mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
- dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled,
- 0, 0);
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
- mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
dst_stride2);
for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -6046,13 +6036,13 @@
set_skip_context(xd, mi_row, mi_col);
set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
- update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 0, pc_tree);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
for (plane = 0; plane < MAX_MB_PLANE; plane++) {
dst_buf[plane] = xd->plane[plane].dst.buf;
dst_stride[plane] = xd->plane[plane].dst.stride;
}
- predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 0, bsize,
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
bsize, dst_buf, dst_stride, pc_tree);
set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index ddd3cc5..acb5498 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3086,15 +3086,15 @@
cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
// [0] is allocated to the current coded frame. The statistics for the
- // reference frames start at [1].
+ // reference frames start at [LAST_FRAME], i.e. [1].
if (!cpi->rc.is_src_frame_alt_ref) {
- memcpy(cpi->interp_filter_selected[ref_frame + 1],
- cpi->interp_filter_selected[ref_frame],
- sizeof(cpi->interp_filter_selected[ref_frame]));
+ memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME],
+ cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME]));
}
}
}
-#endif
+#endif // CONFIG_EXT_REFS
void av1_update_reference_frames(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
@@ -3181,14 +3181,12 @@
int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
shift_last_ref_frames(cpi);
-
cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
- if (!cpi->rc.is_src_frame_alt_ref) {
- memcpy(cpi->interp_filter_selected[0],
- cpi->interp_filter_selected[BWDREF_FRAME],
- sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
- }
cpi->bwd_fb_idx = tmp;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[BWDREF_FRAME],
+ sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
} else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
// Deal with the special case for showing existing internal ALTREF_FRAME
// Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
@@ -3198,15 +3196,15 @@
int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
shift_last_ref_frames(cpi);
-
cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = tmp;
+
+ // We need to modify the mapping accordingly
+ cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+
memcpy(cpi->interp_filter_selected[LAST_FRAME],
cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
-
- cpi->alt_fb_idx = tmp;
- // We need to modify the mapping accordingly
- cpi->arf_map[which_arf] = cpi->alt_fb_idx;
#endif // CONFIG_EXT_REFS
} else { /* For non key/golden frames */
if (cpi->refresh_alt_ref_frame) {
@@ -3241,22 +3239,12 @@
uref_cnt_fb(cpi->upsampled_ref_bufs,
&cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
- if (!cpi->rc.is_src_frame_alt_ref) {
+#if !CONFIG_EXT_REFS
+ if (!cpi->rc.is_src_frame_alt_ref)
+#endif // !CONFIG_EXT_REFS
memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
- } else {
- int which_arf = 0;
-#if CONFIG_EXT_REFS
- if (cpi->oxcf.pass == 2) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- which_arf = gf_group->arf_update_idx[gf_group->index];
- }
-#endif // CONFIG_EXT_REFS
- memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
- sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
- }
}
#if CONFIG_EXT_REFS
@@ -3271,6 +3259,7 @@
cpi->alt_fb_idx = cpi->bwd_fb_idx;
cpi->bwd_fb_idx = tmp;
}
+
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
cm->new_fb_idx);
if (use_upsampled_ref)
@@ -3354,20 +3343,14 @@
tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
shift_last_ref_frames(cpi);
-
cpi->lst_fb_idxes[0] = tmp;
- if (!cpi->rc.is_src_frame_alt_ref) {
- if (cm->show_existing_frame) {
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[BWDREF_FRAME],
- sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
- } else {
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
- }
- }
+ assert(cm->show_existing_frame == 0);
+  // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames are used
+  // to refresh the LAST_FRAME.
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
#else
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
@@ -3425,12 +3408,23 @@
}
#if CONFIG_CLPF
cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
- cm->clpf_size = 2;
- CHECK_MEM_ERROR(
- cm, cm->clpf_blocks,
- aom_malloc(((cm->frame_to_show->y_crop_width + 31) & ~31) *
- ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
- 10));
+ cm->clpf_size = CLPF_64X64;
+
+  // Allocate a buffer to hold the status of all filter blocks:
+  // 1 = on, 0 = off, -1 = implicitly off.
+ {
+ int size;
+ cm->clpf_stride = ((cm->frame_to_show->y_crop_width + MIN_FB_SIZE - 1) &
+ ~(MIN_FB_SIZE - 1)) >>
+ MIN_FB_SIZE_LOG2;
+ size = cm->clpf_stride *
+ ((cm->frame_to_show->y_crop_height + MIN_FB_SIZE - 1) &
+ ~(MIN_FB_SIZE - 1)) >>
+ MIN_FB_SIZE_LOG2;
+ CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
+ memset(cm->clpf_blocks, CLPF_NOFLAG, size);
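+    // e.g., assuming MIN_FB_SIZE == 32, a 1920x1080 frame gives
+    // cm->clpf_stride == 60, a 60x34 grid of flags, and size == 2040 bytes.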
+ }
+
if (!is_lossless_requested(&cpi->oxcf)) {
const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
@@ -3445,20 +3439,18 @@
// Apply the filter using the chosen strength
cm->clpf_strength_y = strength_y - (strength_y == 4);
cm->clpf_size =
- fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
- cm->clpf_numblocks = av1_clpf_frame(
- frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
- 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
+ fb_size_log2 ? fb_size_log2 - MAX_FB_SIZE_LOG2 + 3 : CLPF_NOSIZE;
+ av1_clpf_frame(frame, cpi->Source, cm, cm->clpf_size != CLPF_NOSIZE,
+ strength_y, 4 + cm->clpf_size, AOM_PLANE_Y,
+ av1_clpf_decision);
}
if (strength_u) {
cm->clpf_strength_u = strength_u - (strength_u == 4);
- av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
- NULL);
+ av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, AOM_PLANE_U, NULL);
}
if (strength_v) {
cm->clpf_strength_v = strength_v - (strength_v == 4);
- av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
- NULL);
+ av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, AOM_PLANE_V, NULL);
}
}
#endif
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 5821d3f..9fdf540 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -2585,10 +2585,11 @@
if (cpi->num_extra_arfs) {
int tmp = cpi->bwd_fb_idx;
- cpi->rc.is_bwd_ref_frame = 1;
cpi->bwd_fb_idx = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->arf_map[0];
cpi->arf_map[0] = tmp;
+
+ cpi->rc.is_bwd_ref_frame = 1;
} else {
cpi->rc.is_bwd_ref_frame = 0;
}
@@ -2639,11 +2640,13 @@
// NOTE: The indices will be swapped back after this frame is encoded
// (in av1_update_reference_frames()).
int tmp = cpi->bwd_fb_idx;
+
cpi->bwd_fb_idx = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->arf_map[0];
cpi->arf_map[0] = tmp;
}
break;
+
case LAST_BIPRED_UPDATE:
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 9589a48..1103c4b 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -180,16 +180,14 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+ av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
- break;
+ case H_FLIPADST: av1_fht32x32(src_diff, coeff, diff_stride, tx_type); break;
case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 726a4c1..4ef83cd 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -10,6 +10,7 @@
*/
#include <string.h>
+#include <math.h>
#include "./aom_scale_rtcd.h"
#include "av1/common/dering.h"
@@ -46,12 +47,8 @@
int bsize[3];
int dec[3];
int pli;
- int(*mse)[MAX_DERING_LEVEL];
- double tot_mse[MAX_DERING_LEVEL] = { 0 };
int level;
int best_level;
- int global_level;
- double best_tot_mse = 1e15;
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
@@ -89,68 +86,47 @@
}
nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
- mse = aom_malloc(nvsb * nhsb * sizeof(*mse));
+  /* Pick a base threshold based on the quantizer. The threshold will then be
+     adjusted on a per-64x64-block basis. We use a threshold of the form
+     T = a*Q^b, where a and b were derived empirically to optimize
+     rate-distortion performance across a range of quantizer settings. */
+ best_level = AOMMIN(
+ MAX_DERING_LEVEL - 1,
+ (int)floor(.5 +
+ .45 * pow(av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >>
+ (cm->bit_depth - 8),
+ 0.6)));
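+  // For example, an 8-bit AC quantizer value of 40 gives
+  // floor(.5 + .45 * pow(40, 0.6)) ~= floor(4.6) = 4.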
for (sbr = 0; sbr < nvsb; sbr++) {
for (sbc = 0; sbc < nhsb; sbc++) {
int nvb, nhb;
+ int gi;
+ int best_gi;
+ int32_t best_mse = INT32_MAX;
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
- for (level = 0; level < 64; level++) {
+ if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+ best_gi = 0;
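+      // Refine the base level per 64x64 block: try each refinement index
+      // and keep the one whose deringed output has the lowest MSE against
+      // the source.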
+ for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
int cur_mse;
int threshold;
+ level = compute_level_from_index(best_level, gi);
threshold = level << coeff_shift;
- od_dering(
- &OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[0],
- &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
- sbc * bsize[0] * MAX_MIB_SIZE],
- cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, dir, 0,
- &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
- cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
+ od_dering(dst, MAX_MIB_SIZE * bsize[0],
+ &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
+ sbc * bsize[0] * MAX_MIB_SIZE],
+ cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
+ dir, 0,
+ &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
+ cm->mi_cols, threshold, coeff_shift);
cur_mse = (int)compute_dist(
dst, MAX_MIB_SIZE * bsize[0],
&ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +
sbc * bsize[0] * MAX_MIB_SIZE],
stride, nhb, nvb, coeff_shift);
- mse[nhsb * sbr + sbc][level] = cur_mse;
- tot_mse[level] += cur_mse;
- }
- }
- }
-#if DERING_REFINEMENT
- best_level = 0;
- /* Search for the best global level one value at a time. */
- for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
- double tot_mse = 0;
- for (sbr = 0; sbr < nvsb; sbr++) {
- for (sbc = 0; sbc < nhsb; sbc++) {
- int gi;
- int best_mse = mse[nhsb * sbr + sbc][0];
- for (gi = 1; gi < 4; gi++) {
- level = compute_level_from_index(global_level, gi);
- if (mse[nhsb * sbr + sbc][level] < best_mse) {
- best_mse = mse[nhsb * sbr + sbc][level];
- }
- }
- tot_mse += best_mse;
- }
- }
- if (tot_mse < best_tot_mse) {
- best_level = global_level;
- best_tot_mse = tot_mse;
- }
- }
- for (sbr = 0; sbr < nvsb; sbr++) {
- for (sbc = 0; sbc < nhsb; sbc++) {
- int gi;
- int best_gi;
- int best_mse = mse[nhsb * sbr + sbc][0];
- best_gi = 0;
- for (gi = 1; gi < DERING_REFINEMENT_LEVELS; gi++) {
- level = compute_level_from_index(best_level, gi);
- if (mse[nhsb * sbr + sbc][level] < best_mse) {
+ if (cur_mse < best_mse) {
best_gi = gi;
- best_mse = mse[nhsb * sbr + sbc][level];
+ best_mse = cur_mse;
}
}
cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
@@ -158,15 +134,8 @@
->mbmi.dering_gain = best_gi;
}
}
-#else
- best_level = 0;
- for (level = 0; level < MAX_DERING_LEVEL; level++) {
- if (tot_mse[level] < tot_mse[best_level]) best_level = level;
- }
-#endif
aom_free(src);
aom_free(ref_coeff);
aom_free(bskip);
- aom_free(mse);
return best_level;
}
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index ee65107..ff96714 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -153,9 +153,6 @@
}
void av1_fill_token_costs(av1_coeff_cost *c,
-#if CONFIG_ANS
- coeff_cdf_model (*cdf)[PLANE_TYPES],
-#endif // CONFIG_ANS
av1_coeff_probs_model (*p)[PLANE_TYPES]) {
int i, j, k, l;
TX_SIZE t;
@@ -164,19 +161,11 @@
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-#if CONFIG_ANS
- const aom_prob *const tree_probs = p[t][i][j][k][l];
- av1_cost_tokens_ans((int *)c[t][i][j][k][0][l], tree_probs,
- cdf[t][i][j][k][l], 0);
- av1_cost_tokens_ans((int *)c[t][i][j][k][1][l], tree_probs,
- cdf[t][i][j][k][l], 1);
-#else
aom_prob probs[ENTROPY_NODES];
av1_model_to_full_probs(p[t][i][j][k][l], probs);
av1_cost_tokens((int *)c[t][i][j][k][0][l], probs, av1_coef_tree);
av1_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
av1_coef_tree);
-#endif // CONFIG_ANS
assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
c[t][i][j][k][1][l][EOB_TOKEN]);
}
@@ -387,11 +376,7 @@
#endif
}
if (cpi->oxcf.pass != 1) {
- av1_fill_token_costs(x->token_costs,
-#if CONFIG_ANS
- cm->fc->coef_cdfs,
-#endif // CONFIG_ANS
- cm->fc->coef_probs);
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
cm->frame_type == KEY_FRAME) {
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 933733b..3ca4768 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -431,9 +431,6 @@
int best_mode_index);
void av1_fill_token_costs(av1_coeff_cost *c,
-#if CONFIG_ANS
- coeff_cdf_model (*cdf)[PLANE_TYPES],
-#endif // CONFIG_ANS
av1_coeff_probs_model (*p)[PLANE_TYPES]);
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 78d61e2..8707061 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -46,6 +46,7 @@
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
#if CONFIG_DUAL_FILTER
#if CONFIG_EXT_INTERP
@@ -865,14 +866,14 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
* decide whether to include cost of a trailing EOB node or not (i.e. we
* can skip this if the last coefficient in this transform block, e.g. the
* 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
* were non-zero). */
-static int cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
- TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
- int use_fast_coef_costing) {
+int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
+ TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const struct macroblock_plane *p = &x->plane[plane];
@@ -1064,8 +1065,9 @@
static int rate_block(int plane, int block, int coeff_ctx, TX_SIZE tx_size,
struct rdcost_block_args *args) {
- return cost_coeffs(args->x, plane, block, coeff_ctx, tx_size, args->so->scan,
- args->so->neighbors, args->use_fast_coef_costing);
+ return av1_cost_coeffs(args->x, plane, block, coeff_ctx, tx_size,
+ args->so->scan, args->so->neighbors,
+ args->use_fast_coef_costing);
}
static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
@@ -1946,8 +1948,9 @@
av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
TX_4X4, AV1_XFORM_QUANT_FP);
#endif // CONFIG_NEW_QUANT
- ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
- so->neighbors, cpi->sf.use_fast_coef_costing);
+ ratey +=
+ av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
*(tempa + idx) = !(p->eobs[block] == 0);
*(templ + idy) = !(p->eobs[block] == 0);
can_skip &= (p->eobs[block] == 0);
@@ -1971,8 +1974,9 @@
TX_4X4, AV1_XFORM_QUANT_FP);
#endif // CONFIG_NEW_QUANT
av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
- ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
- so->neighbors, cpi->sf.use_fast_coef_costing);
+ ratey +=
+ av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
*(tempa + idx) = !(p->eobs[block] == 0);
*(templ + idy) = !(p->eobs[block] == 0);
can_skip &= (p->eobs[block] == 0);
@@ -2064,8 +2068,9 @@
av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8, TX_4X4,
AV1_XFORM_QUANT_B);
#endif // CONFIG_NEW_QUANT
- ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
- so->neighbors, cpi->sf.use_fast_coef_costing);
+ ratey +=
+ av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
*(tempa + idx) = !(p->eobs[block] == 0);
*(templ + idy) = !(p->eobs[block] == 0);
can_skip &= (p->eobs[block] == 0);
@@ -2088,8 +2093,9 @@
AV1_XFORM_QUANT_FP);
#endif // CONFIG_NEW_QUANT
av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
- ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
- so->neighbors, cpi->sf.use_fast_coef_costing);
+ ratey +=
+ av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
*(tempa + idx) = !(p->eobs[block] == 0);
*(templ + idy) = !(p->eobs[block] == 0);
can_skip &= (p->eobs[block] == 0);
@@ -2964,8 +2970,8 @@
}
}
*dist += tmp * 16;
- *rate += cost_coeffs(x, plane, block, coeff_ctx, tx_size, scan_order->scan,
- scan_order->neighbors, 0);
+ *rate += av1_cost_coeffs(x, plane, block, coeff_ctx, tx_size,
+ scan_order->scan, scan_order->neighbors, 0);
*skip &= (p->eobs[block] == 0);
}
@@ -4374,8 +4380,8 @@
&dist, &ssz);
thisdistortion += dist;
thissse += ssz;
- thisrate += cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan,
- so->neighbors, cpi->sf.use_fast_coef_costing);
+ thisrate += av1_cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
*(ta + (k & 1)) = !(p->eobs[block] == 0);
*(tl + (k >> 1)) = !(p->eobs[block] == 0);
#if CONFIG_EXT_TX
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index eb0ff9f..584c439 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -26,6 +26,9 @@
struct macroblock;
struct RD_COST;
+int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
+ TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing);
void av1_rd_pick_intra_mode_sb(struct AV1_COMP *cpi, struct macroblock *x,
struct RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 3bf2410..8095681 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -23,6 +23,7 @@
#include "av1/encoder/cost.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
#include "av1/encoder/tokenize.h"
static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
@@ -346,8 +347,31 @@
AV1_COMP *cpi;
ThreadData *td;
TOKENEXTRA **tp;
+ int this_rate;
};
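+
+// Visitor for DRY_RUN_COSTCOEFFS: adds the block's coefficient token cost
+// (via av1_cost_coeffs) to args->this_rate and updates the entropy contexts
+// from the block's eob.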
+static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
+ const int ref = is_inter_block(mbmi);
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const scan_order *const so = get_scan(tx_size, tx_type, ref);
+ int pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+ pd->left_context + blk_row);
+ int rate =
+ av1_cost_coeffs(x, plane, block, pt, tx_size, so->scan, so->neighbors, 0);
+ args->this_rate += rate;
+ av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
+ blk_row);
+}
+
static void set_entropy_context_b(int plane, int block, int blk_row,
int blk_col, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
@@ -363,7 +387,7 @@
static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
#if CONFIG_ANS
- const rans_lut *token_cdf,
+ const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
#endif // CONFIG_ANS
int32_t extra, uint8_t token,
uint8_t skip_eob_node, unsigned int *counts) {
@@ -378,25 +402,15 @@
++counts[token];
}
-static INLINE void add_token_no_extra(TOKENEXTRA **t,
- const aom_prob *context_tree,
- uint8_t token, uint8_t skip_eob_node,
- unsigned int *counts) {
- (*t)->token = token;
- (*t)->context_tree = context_tree;
- (*t)->skip_eob_node = skip_eob_node;
- (*t)++;
- ++counts[token];
-}
-
static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
const int eob_max = num_4x4_blocks_txsize_lookup[tx_size] << 4;
return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}
-void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize,
- int plane, TOKENEXTRA **t) {
+void av1_tokenize_palette_sb(AV1_COMP *cpi, struct ThreadData *const td,
+ int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize, int *rate) {
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -404,7 +418,8 @@
PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
int n = pmi->palette_size[plane != 0];
int i, j, k;
- int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+ int this_rate = 0;
+ int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
(xd->plane[plane != 0].subsampling_y);
const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
@@ -419,16 +434,19 @@
av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
for (k = 0; k < n; ++k)
if (color_map[i * cols + j] == color_order[k]) {
- color_new_idx = k;
+ color_idx = k;
break;
}
- assert(color_new_idx >= 0 && color_new_idx < n);
- (*t)->token = color_new_idx;
+ assert(color_idx >= 0 && color_idx < n);
+ if (dry_run == DRY_RUN_COSTCOEFFS)
+ this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx];
+ (*t)->token = color_idx;
(*t)->context_tree = probs[n - 2][color_ctx];
(*t)->skip_eob_node = 0;
++(*t);
}
}
+ if (rate) *rate += this_rate;
}
static void tokenize_b(int plane, int block, int blk_row, int blk_col,
@@ -469,8 +487,8 @@
cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
#endif // CONFIG_ENTROPY
#if CONFIG_ANS
- rans_lut(*const coef_cdfs)[COEFF_CONTEXTS] =
- cpi->common.fc->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
+ aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ cpi->common.fc->coef_cdfs[tx_size][type][ref];
#endif // CONFIG_ANS
unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
@@ -493,7 +511,7 @@
add_token(&t, coef_probs[band[c]][pt],
#if CONFIG_ANS
- (const rans_lut *)&coef_cdfs[band[c]][pt],
+ (const aom_cdf_prob(*)[ENTROPY_TOKENS]) & coef_cdfs[band[c]][pt],
#endif // CONFIG_ANS
extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
@@ -503,8 +521,11 @@
skip_eob = (token == ZERO_TOKEN);
}
if (c < seg_eob) {
- add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
- counts[band[c]][pt]);
+ add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_ANS || CONFIG_DAALA_EC
+ NULL,
+#endif
+ 0, EOB_TOKEN, 0, counts[band[c]][pt]);
++eob_branch[band[c]][pt];
}
@@ -560,7 +581,7 @@
}
#if CONFIG_VAR_TX
-void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, int dry_run,
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
int blk_col, int block, int plane, void *arg) {
MACROBLOCK *const x = &td->mb;
@@ -593,9 +614,11 @@
BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
if (!dry_run)
tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
- else
+ else if (dry_run == DRY_RUN_NORMAL)
set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
tx_size, arg);
+ else if (dry_run == DRY_RUN_COSTCOEFFS)
+ cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
} else {
int bsl = b_width_log2_lookup[bsize];
int i;
@@ -617,8 +640,8 @@
}
void av1_tokenize_sb_vartx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
- int dry_run, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -627,7 +650,7 @@
const int ctx = av1_get_skip_context(xd);
const int skip_inc =
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
- struct tokenize_b_args arg = { cpi, td, t };
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
int plane;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
@@ -667,11 +690,12 @@
(*t)++;
}
}
+ if (rate) *rate += arg.this_rate;
}
#endif // CONFIG_VAR_TX
-void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE bsize) {
+void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -679,7 +703,7 @@
const int ctx = av1_get_skip_context(xd);
const int skip_inc =
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
- struct tokenize_b_args arg = { cpi, td, t };
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
if (mbmi->skip) {
if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
@@ -697,14 +721,17 @@
(*t)->token = EOSB_TOKEN;
(*t)++;
}
- } else {
+ } else if (dry_run == DRY_RUN_NORMAL) {
av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
}
+ if (rate) *rate += arg.this_rate;
}
#if CONFIG_SUPERTX
void av1_tokenize_sb_supertx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
- int dry_run, BLOCK_SIZE bsize) {
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &td->mb.e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -712,7 +739,7 @@
const int ctx = av1_get_skip_context(xd);
const int skip_inc =
!segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
- struct tokenize_b_args arg = { cpi, td, t };
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
if (mbmi->skip) {
if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
@@ -730,9 +757,12 @@
(*t)->token = EOSB_TOKEN;
(*t)++;
}
- } else {
+ } else if (dry_run == DRY_RUN_NORMAL) {
av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
*t = t_backup;
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
}
+ if (rate) *rate += arg.this_rate;
}
#endif // CONFIG_SUPERTX
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index a7e30d5..f20848a 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -37,7 +37,7 @@
typedef struct {
const aom_prob *context_tree;
#if CONFIG_ANS
- const rans_lut *token_cdf;
+ const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
#endif // CONFIG_ANS
EXTRABIT extra;
uint8_t token;
@@ -56,19 +56,31 @@
struct AV1_COMP;
struct ThreadData;
+typedef enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} RUN_TYPE;
+
+// Note: in all of the tokenize functions below, if 'rate' is non-NULL it is
+// incremented with the coefficient token cost only when
+// dry_run == DRY_RUN_COSTCOEFFS; otherwise 'rate' is left unchanged.
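+//
+// A hypothetical caller estimating a block's coefficient rate without
+// emitting tokens or updating counts might do:
+//   int rate = 0;
+//   av1_tokenize_sb(cpi, td, &t, DRY_RUN_COSTCOEFFS, bsize, &rate);
+//   // 'rate' now holds the accumulated av1_cost_coeffs() cost.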
#if CONFIG_VAR_TX
void av1_tokenize_sb_vartx(struct AV1_COMP *cpi, struct ThreadData *td,
- TOKENEXTRA **t, int dry_run, int mi_row, int mi_col,
- BLOCK_SIZE bsize);
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate);
#endif
-void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize,
- int plane, TOKENEXTRA **t);
+void av1_tokenize_palette_sb(struct AV1_COMP *cpi, struct ThreadData *const td,
+ int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize, int *rate);
void av1_tokenize_sb(struct AV1_COMP *cpi, struct ThreadData *td,
- TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
#if CONFIG_SUPERTX
void av1_tokenize_sb_supertx(struct AV1_COMP *cpi, struct ThreadData *td,
- TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
#endif
extern const int16_t *av1_dct_value_cost_ptr;
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index b23d39d..69bf89a 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -25,8 +25,7 @@
*u = _mm256_permute2x128_si256(v, v, 1);
}
-void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
- int stride) {
+static int32_t get_16x16_sum(const int16_t *input, int stride) {
__m256i r0, r1, r2, r3, u0, u1;
__m256i zero = _mm256_setzero_si256();
__m256i sum = _mm256_setzero_si256();
@@ -61,8 +60,14 @@
_mm256_castsi256_si128(u1));
v1 = _mm_srli_si128(v0, 4);
v0 = _mm_add_epi32(v0, v1);
- v0 = _mm_srai_epi32(v0, 1);
- output[0] = (tran_low_t)_mm_extract_epi32(v0, 0);
+ return (int32_t)_mm_extract_epi32(v0, 0);
+}
+
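+// DC-only 16x16 forward transform: only output[0] is written, as the block
+// sum scaled by 1/2.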
+void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32_t dc = get_16x16_sum(input, stride);
+ output[0] = (tran_low_t)(dc >> 1);
+ _mm256_zeroupper();
}
static void mm256_transpose_16x16(__m256i *in) {
@@ -559,8 +564,6 @@
x1 = _mm256_unpackhi_epi16(u3, u4);
in[13] = butter_fly(x0, x1, cospi_p06_p26);
in[3] = butter_fly(x0, x1, cospi_m26_p06);
-
- mm256_transpose_16x16(in);
}
void fadst16_avx2(__m256i *in) {
@@ -1105,8 +1108,6 @@
in[3] = _mm256_sub_epi16(zero, x4);
in[13] = _mm256_sub_epi16(zero, x13);
in[15] = _mm256_sub_epi16(zero, x1);
-
- mm256_transpose_16x16(in);
}
#if CONFIG_EXT_TX
@@ -1134,7 +1135,6 @@
in[i] = _mm256_packs_epi32(u0, u1);
i++;
}
- mm256_transpose_16x16(in);
}
#endif
@@ -1146,24 +1146,28 @@
case DCT_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case ADST_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case DCT_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case ADST_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
@@ -1171,71 +1175,698 @@
case FLIPADST_DCT:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case DCT_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fdct16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case FLIPADST_FLIPADST:
load_buffer_16x16(input, stride, 1, 1, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case ADST_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case FLIPADST_ADST:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case V_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_DCT:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fdct16_avx2(in);
break;
case V_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_ADST:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
case V_FLIPADST:
load_buffer_16x16(input, stride, 1, 0, in);
fadst16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case H_FLIPADST:
load_buffer_16x16(input, stride, 0, 1, in);
fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
right_shift_16x16(in);
fadst16_avx2(in);
break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
}
+ mm256_transpose_16x16(in);
write_buffer_16x16(in, 16, output);
+ _mm256_zeroupper();
+}
+
+void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
+ int stride) {
+  // top-left 16x16 block
+  int32_t sum = get_16x16_sum(input, stride);
+  // top-right 16x16 block
+  sum += get_16x16_sum(input + 16, stride);
+  // bottom-left 16x16 block
+  sum += get_16x16_sum(input + (stride << 4), stride);
+  // bottom-right 16x16 block
+  sum += get_16x16_sum(input + (stride << 4) + 16, stride);
+
+ sum >>= 3;
+ output[0] = (tran_low_t)sum;
+ _mm256_zeroupper();
+}
+
+#if CONFIG_EXT_TX
+static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
+ int i = 0;
+ __m256i temp;
+ while (i < size) {
+ temp = a0[i];
+ a0[i] = a1[i];
+ a1[i] = temp;
+ i++;
+ }
+}
+
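+// Transpose a 32x32 block stored as two 32x16 halves. Viewing the block as
+// 16x16 quadrants M = [[A,B],[C,D]], M^T = [[A^T,C^T],[B^T,D^T]]: each
+// quadrant is transposed in place and the off-diagonal pair is swapped.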
+static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
+ mm256_transpose_16x16(in0);
+ mm256_transpose_16x16(&in0[16]);
+ mm256_transpose_16x16(in1);
+ mm256_transpose_16x16(&in1[16]);
+ mm256_vectors_swap(&in0[16], in1, 16);
+}
+
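+// First butterfly stage of the 32-point DCT, split into an even half that
+// takes the sums in[i] + in[31 - i] and an odd half that takes the
+// differences in[15 - i] - in[16 + i].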
+static void prepare_16x16_even(const __m256i *in, __m256i *even) {
+ even[0] = _mm256_add_epi16(in[0], in[31]);
+ even[1] = _mm256_add_epi16(in[1], in[30]);
+ even[2] = _mm256_add_epi16(in[2], in[29]);
+ even[3] = _mm256_add_epi16(in[3], in[28]);
+ even[4] = _mm256_add_epi16(in[4], in[27]);
+ even[5] = _mm256_add_epi16(in[5], in[26]);
+ even[6] = _mm256_add_epi16(in[6], in[25]);
+ even[7] = _mm256_add_epi16(in[7], in[24]);
+ even[8] = _mm256_add_epi16(in[8], in[23]);
+ even[9] = _mm256_add_epi16(in[9], in[22]);
+ even[10] = _mm256_add_epi16(in[10], in[21]);
+ even[11] = _mm256_add_epi16(in[11], in[20]);
+ even[12] = _mm256_add_epi16(in[12], in[19]);
+ even[13] = _mm256_add_epi16(in[13], in[18]);
+ even[14] = _mm256_add_epi16(in[14], in[17]);
+ even[15] = _mm256_add_epi16(in[15], in[16]);
+}
+
+static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
+ odd[0] = _mm256_sub_epi16(in[15], in[16]);
+ odd[1] = _mm256_sub_epi16(in[14], in[17]);
+ odd[2] = _mm256_sub_epi16(in[13], in[18]);
+ odd[3] = _mm256_sub_epi16(in[12], in[19]);
+ odd[4] = _mm256_sub_epi16(in[11], in[20]);
+ odd[5] = _mm256_sub_epi16(in[10], in[21]);
+ odd[6] = _mm256_sub_epi16(in[9], in[22]);
+ odd[7] = _mm256_sub_epi16(in[8], in[23]);
+ odd[8] = _mm256_sub_epi16(in[7], in[24]);
+ odd[9] = _mm256_sub_epi16(in[6], in[25]);
+ odd[10] = _mm256_sub_epi16(in[5], in[26]);
+ odd[11] = _mm256_sub_epi16(in[4], in[27]);
+ odd[12] = _mm256_sub_epi16(in[3], in[28]);
+ odd[13] = _mm256_sub_epi16(in[2], in[29]);
+ odd[14] = _mm256_sub_epi16(in[1], in[30]);
+ odd[15] = _mm256_sub_epi16(in[0], in[31]);
+}
+
+static void collect_16col(const __m256i *even, const __m256i *odd,
+ __m256i *out) {
+ // fdct16_avx2() already maps the output
+ out[0] = even[0];
+ out[2] = even[1];
+ out[4] = even[2];
+ out[6] = even[3];
+ out[8] = even[4];
+ out[10] = even[5];
+ out[12] = even[6];
+ out[14] = even[7];
+ out[16] = even[8];
+ out[18] = even[9];
+ out[20] = even[10];
+ out[22] = even[11];
+ out[24] = even[12];
+ out[26] = even[13];
+ out[28] = even[14];
+ out[30] = even[15];
+
+ out[1] = odd[0];
+ out[17] = odd[1];
+ out[9] = odd[2];
+ out[25] = odd[3];
+ out[5] = odd[4];
+ out[21] = odd[5];
+ out[13] = odd[6];
+ out[29] = odd[7];
+ out[3] = odd[8];
+ out[19] = odd[9];
+ out[11] = odd[10];
+ out[27] = odd[11];
+ out[7] = odd[12];
+ out[23] = odd[13];
+ out[15] = odd[14];
+ out[31] = odd[15];
+}
+
+static void collect_coeffs(const __m256i *first_16col_even,
+ const __m256i *first_16col_odd,
+ const __m256i *second_16col_even,
+ const __m256i *second_16col_odd, __m256i *in0,
+ __m256i *in1) {
+ collect_16col(first_16col_even, first_16col_odd, in0);
+ collect_16col(second_16col_even, second_16col_odd, in1);
+}
+
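+// Computes the 16 odd-indexed outputs of the 32-point DCT via stages 2-8 of
+// the odd butterfly network.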
+static void fdct16_odd_avx2(__m256i *in) {
+  // Naming: cospi_L_H packs the constant pair (L, H), with L first.
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
+ const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
+ const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
+ const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
+ const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
+ const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
+ const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
+ const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
+ const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+ __m256i u0, u1;
+
+ // stage 1 is in prepare_16x16_odd()
+
+ // stage 2
+ y0 = in[0];
+ y1 = in[1];
+ y2 = in[2];
+ y3 = in[3];
+
+ u0 = _mm256_unpacklo_epi16(in[4], in[11]);
+ u1 = _mm256_unpackhi_epi16(in[4], in[11]);
+ y4 = butter_fly(u0, u1, cospi_m16_p16);
+ y11 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ u1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ y5 = butter_fly(u0, u1, cospi_m16_p16);
+ y10 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[6], in[9]);
+ u1 = _mm256_unpackhi_epi16(in[6], in[9]);
+ y6 = butter_fly(u0, u1, cospi_m16_p16);
+ y9 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ u1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ y7 = butter_fly(u0, u1, cospi_m16_p16);
+ y8 = butter_fly(u0, u1, cospi_p16_p16);
+
+ y12 = in[12];
+ y13 = in[13];
+ y14 = in[14];
+ y15 = in[15];
+
+ // stage 3
+ x0 = _mm256_add_epi16(y0, y7);
+ x1 = _mm256_add_epi16(y1, y6);
+ x2 = _mm256_add_epi16(y2, y5);
+ x3 = _mm256_add_epi16(y3, y4);
+ x4 = _mm256_sub_epi16(y3, y4);
+ x5 = _mm256_sub_epi16(y2, y5);
+ x6 = _mm256_sub_epi16(y1, y6);
+ x7 = _mm256_sub_epi16(y0, y7);
+ x8 = _mm256_sub_epi16(y15, y8);
+ x9 = _mm256_sub_epi16(y14, y9);
+ x10 = _mm256_sub_epi16(y13, y10);
+ x11 = _mm256_sub_epi16(y12, y11);
+ x12 = _mm256_add_epi16(y12, y11);
+ x13 = _mm256_add_epi16(y13, y10);
+ x14 = _mm256_add_epi16(y14, y9);
+ x15 = _mm256_add_epi16(y15, y8);
+
+ // stage 4
+ y0 = x0;
+ y1 = x1;
+ y6 = x6;
+ y7 = x7;
+ y8 = x8;
+ y9 = x9;
+ y14 = x14;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m08_p24);
+ y13 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ y3 = butter_fly(u0, u1, cospi_m08_p24);
+ y12 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ y4 = butter_fly(u0, u1, cospi_m24_m08);
+ y11 = butter_fly(u0, u1, cospi_m08_p24);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m24_m08);
+ y10 = butter_fly(u0, u1, cospi_m08_p24);
+
+ // stage 5
+ x0 = _mm256_add_epi16(y0, y3);
+ x1 = _mm256_add_epi16(y1, y2);
+ x2 = _mm256_sub_epi16(y1, y2);
+ x3 = _mm256_sub_epi16(y0, y3);
+ x4 = _mm256_sub_epi16(y7, y4);
+ x5 = _mm256_sub_epi16(y6, y5);
+ x6 = _mm256_add_epi16(y6, y5);
+ x7 = _mm256_add_epi16(y7, y4);
+
+ x8 = _mm256_add_epi16(y8, y11);
+ x9 = _mm256_add_epi16(y9, y10);
+ x10 = _mm256_sub_epi16(y9, y10);
+ x11 = _mm256_sub_epi16(y8, y11);
+ x12 = _mm256_sub_epi16(y15, y12);
+ x13 = _mm256_sub_epi16(y14, y13);
+ x14 = _mm256_add_epi16(y14, y13);
+ x15 = _mm256_add_epi16(y15, y12);
+
+ // stage 6
+ y0 = x0;
+ y3 = x3;
+ y4 = x4;
+ y7 = x7;
+ y8 = x8;
+ y11 = x11;
+ y12 = x12;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ y1 = butter_fly(u0, u1, cospi_m04_p28);
+ y14 = butter_fly(u0, u1, cospi_p28_p04);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m28_m04);
+ y13 = butter_fly(u0, u1, cospi_m04_p28);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m20_p12);
+ y10 = butter_fly(u0, u1, cospi_p12_p20);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ y6 = butter_fly(u0, u1, cospi_m12_m20);
+ y9 = butter_fly(u0, u1, cospi_m20_p12);
+
+ // stage 7
+ x0 = _mm256_add_epi16(y0, y1);
+ x1 = _mm256_sub_epi16(y0, y1);
+ x2 = _mm256_sub_epi16(y3, y2);
+ x3 = _mm256_add_epi16(y3, y2);
+ x4 = _mm256_add_epi16(y4, y5);
+ x5 = _mm256_sub_epi16(y4, y5);
+ x6 = _mm256_sub_epi16(y7, y6);
+ x7 = _mm256_add_epi16(y7, y6);
+
+ x8 = _mm256_add_epi16(y8, y9);
+ x9 = _mm256_sub_epi16(y8, y9);
+ x10 = _mm256_sub_epi16(y11, y10);
+ x11 = _mm256_add_epi16(y11, y10);
+ x12 = _mm256_add_epi16(y12, y13);
+ x13 = _mm256_sub_epi16(y12, y13);
+ x14 = _mm256_sub_epi16(y15, y14);
+ x15 = _mm256_add_epi16(y15, y14);
+
+ // stage 8
+ u0 = _mm256_unpacklo_epi16(x0, x15);
+ u1 = _mm256_unpackhi_epi16(x0, x15);
+ in[0] = butter_fly(u0, u1, cospi_p31_p01);
+ in[15] = butter_fly(u0, u1, cospi_m01_p31);
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ in[1] = butter_fly(u0, u1, cospi_p15_p17);
+ in[14] = butter_fly(u0, u1, cospi_m17_p15);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ in[2] = butter_fly(u0, u1, cospi_p23_p09);
+ in[13] = butter_fly(u0, u1, cospi_m09_p23);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ in[3] = butter_fly(u0, u1, cospi_p07_p25);
+ in[12] = butter_fly(u0, u1, cospi_m25_p07);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ in[4] = butter_fly(u0, u1, cospi_p27_p05);
+ in[11] = butter_fly(u0, u1, cospi_m05_p27);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ in[5] = butter_fly(u0, u1, cospi_p11_p21);
+ in[10] = butter_fly(u0, u1, cospi_m21_p11);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ in[6] = butter_fly(u0, u1, cospi_p19_p13);
+ in[9] = butter_fly(u0, u1, cospi_m13_p19);
+
+ u0 = _mm256_unpacklo_epi16(x7, x8);
+ u1 = _mm256_unpackhi_epi16(x7, x8);
+ in[7] = butter_fly(u0, u1, cospi_p03_p29);
+ in[8] = butter_fly(u0, u1, cospi_m29_p03);
+}
+
+static void fdct32_avx2(__m256i *in0, __m256i *in1) {
+ __m256i even0[16], even1[16], odd0[16], odd1[16];
+ prepare_16x16_even(in0, even0);
+ fdct16_avx2(even0);
+
+ prepare_16x16_odd(in0, odd0);
+ fdct16_odd_avx2(odd0);
+
+ prepare_16x16_even(in1, even1);
+ fdct16_avx2(even1);
+
+ prepare_16x16_odd(in1, odd1);
+ fdct16_odd_avx2(odd1);
+
+ collect_coeffs(even0, odd0, even1, odd1, in0, in1);
+
+ mm256_transpose_32x32(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
+ int stride, tran_low_t *output) {
+ int i = 0;
+ tran_low_t *coeff = output;
+ while (i < 32) {
+ _mm256_storeu_si256((__m256i *)coeff, in0[i]);
+ _mm256_storeu_si256((__m256i *)(coeff + 16), in1[i]);
+ coeff += stride;
+ i += 1;
+ }
+}
+
+#if CONFIG_EXT_TX
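+// "Half-right" 32-point transform used here in place of a full 32-point
+// ADST: the first 16 rows are only scaled (<< 2) while the last 16 rows are
+// scaled by Sqrt2 and passed through a 16-point DCT.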
+static void fhalfright32_16col_avx2(__m256i *in) {
+ int i = 0;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i x0, x1;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 2);
+ x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
+ x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
+ x0 = _mm256_madd_epi16(x0, sqrt2);
+ x1 = _mm256_madd_epi16(x1, sqrt2);
+ x0 = _mm256_add_epi32(x0, dct_rounding);
+ x1 = _mm256_add_epi32(x1, dct_rounding);
+ x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
+ in[i + 16] = _mm256_packs_epi32(x0, x1);
+ i += 1;
+ }
+ fdct16_avx2(&in[16]);
+}
+
+static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
+ fhalfright32_16col_avx2(in0);
+ fhalfright32_16col_avx2(in1);
+ mm256_vectors_swap(in0, &in0[16], 16);
+ mm256_vectors_swap(in1, &in1[16], 16);
+ mm256_transpose_32x32(in0, in1);
+}
+
+static void load_buffer_32x32(const int16_t *input, int stride, int flipud,
+ int fliplr, __m256i *in0, __m256i *in1) {
+ // Load 4 16x16 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 16;
+ const int16_t *botL = input + 16 * stride;
+ const int16_t *botR = input + 16 * stride + 16;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+    // Vertical flip: swap the top and bottom blocks of the left half
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+    // Vertical flip: swap the top and bottom blocks of the right half
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+    // Horizontal flip: swap the left and right blocks of the top half
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+    // Horizontal flip: swap the left and right blocks of the bottom half
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 16 columns
+ load_buffer_16x16(topL, stride, flipud, fliplr, in0);
+ load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
+
+ // load second 16 columns
+ load_buffer_16x16(topR, stride, flipud, fliplr, in1);
+ load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
+}
+#endif // CONFIG_EXT_TX
+
+static void nr_right_shift_32x32_16col(__m256i *in) {
+ int i = 0;
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i sign;
+ while (i < 32) {
+ sign = _mm256_srai_epi16(in[i], 15);
+ in[i] = _mm256_add_epi16(in[i], one);
+ in[i] = _mm256_sub_epi16(in[i], sign);
+ in[i] = _mm256_srai_epi16(in[i], 2);
+ i += 1;
+ }
+}
+
+// Negative rounding: per-lane rounding right shift by 2, computed as
+// (x + 1 + (x < 0)) >> 2.
+static void nr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+ nr_right_shift_32x32_16col(in0);
+ nr_right_shift_32x32_16col(in1);
+}
+
+#if CONFIG_EXT_TX
+static void pr_right_shift_32x32_16col(__m256i *in) {
+ int i = 0;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i sign;
+ while (i < 32) {
+ sign = _mm256_cmpgt_epi16(in[i], zero);
+ in[i] = _mm256_add_epi16(in[i], one);
+ in[i] = _mm256_sub_epi16(in[i], sign);
+ in[i] = _mm256_srai_epi16(in[i], 2);
+ i += 1;
+ }
+}
+
+// Positive rounding: per-lane rounding right shift by 2, computed as
+// (x + 1 + (x > 0)) >> 2.
+static void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+ pr_right_shift_32x32_16col(in0);
+ pr_right_shift_32x32_16col(in1);
+}
+
+static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
+ int i = 0;
+ while (i < 32) {
+ in0[i] = _mm256_slli_epi16(in0[i], 2);
+ in1[i] = _mm256_slli_epi16(in1[i], 2);
+ i += 1;
+ }
+ mm256_transpose_32x32(in0, in1);
+}
+#endif
+
+void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+  __m256i in0[32];  // left half: 32 rows x 16 columns
+  __m256i in1[32];  // right half: 32 rows x 16 columns
+ (void)input;
+ (void)stride;
+
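+  // Each transform helper below ends with a 32x32 transpose, so the first
+  // call in every case processes columns and the second processes rows,
+  // with a rounding right shift in between.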
+ switch (tx_type) {
+// TODO(luoyi): For DCT_DCT, the aom_dsp fwd_txfm_32x32() is used instead,
+// although this function is faster. The replacement must work with the
+// corresponding inverse transform.
+// case DCT_DCT:
+// load_buffer_32x32(input, stride, 0, 0, in0, in1);
+// fdct32_avx2(in0, in1);
+// pr_right_shift_32x32(in0, in1);
+// fdct32_avx2(in0, in1);
+// break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case V_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case V_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fidtx32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ nr_right_shift_32x32(in0, in1);
+ write_buffer_32x32(in0, in1, 32, output);
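+ // Zero the upper ymm halves to avoid AVX-SSE transition penalties.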
+ _mm256_zeroupper();
}
diff --git a/configure b/configure
index 231909b..0e33876 100755
--- a/configure
+++ b/configure
@@ -606,6 +606,7 @@
check_add_cflags -Wimplicit-function-declaration
check_add_cflags -Wuninitialized
check_add_cflags -Wunused-variable
+ check_add_cflags -Wsign-compare
case ${CC} in
*clang*) ;;
*) check_add_cflags -Wunused-but-set-variable ;;
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index 43e8fe0..fdb9739 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -307,6 +307,7 @@
const char *height_arg = NULL;
const char *infile_arg = NULL;
const char *outfile_arg = NULL;
+ const char *update_frame_num_arg = NULL;
unsigned int limit = 0;
exec_name = argv[0];
@@ -317,18 +318,21 @@
height_arg = argv[3];
infile_arg = argv[4];
outfile_arg = argv[5];
+ update_frame_num_arg = argv[6];
encoder = get_aom_encoder_by_name(codec_arg);
if (!encoder) die("Unsupported codec.");
- update_frame_num = atoi(argv[6]);
+ update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
// In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
// allocated while calling aom_codec_encode(), thus, setting reference for
// 1st frame isn't supported.
- if (update_frame_num <= 1) die("Couldn't parse frame number '%s'\n", argv[6]);
+ if (update_frame_num <= 1) {
+ die("Couldn't parse frame number '%s'\n", update_frame_num_arg);
+ }
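+ // Note: with base 0, strtoul() also accepts "0x"-prefixed hex and
+ // leading-zero octal input; on parse failure it returns 0, which the
+ // check above rejects.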
if (argc > 7) {
- limit = atoi(argv[7]);
+ limit = (unsigned int)strtoul(argv[7], NULL, 0);
if (update_frame_num > limit)
die("Update frame number couldn't larger than limit\n");
}
diff --git a/test/ans_test.cc b/test/ans_test.cc
index ca38de2..ba8e3c7 100644
--- a/test/ans_test.cc
+++ b/test/ans_test.cc
@@ -74,18 +74,21 @@
return ans_read_end(&d);
}
-// TODO(aconverse@google.com): replace this with a more representative
-// distribution from the codec.
-const rans_sym rans_sym_tab[] = {
- { 67, 0 }, { 99, 67 }, { 575, 166 }, { 283, 741 },
-};
+const aom_cdf_prob spareto65[] = { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 };
-std::vector<int> ans_encode_build_vals(const rans_sym *tab, int iters) {
+const int kRansSymbols =
+ static_cast<int>(sizeof(spareto65) / sizeof(spareto65[0]));
+
+std::vector<int> ans_encode_build_vals(rans_sym *const tab, int iters) {
+ aom_cdf_prob sum = 0;
+ for (int i = 0; i < kRansSymbols; ++i) {
+ tab[i].cum_prob = sum;
+ tab[i].prob = spareto65[i];
+ sum += spareto65[i];
+ }
std::vector<int> p_to_sym;
- int i = 0;
- while (p_to_sym.size() < RANS_PRECISION) {
+ for (int i = 0; i < kRansSymbols; ++i) {
p_to_sym.insert(p_to_sym.end(), tab[i].prob, i);
- ++i;
}
assert(p_to_sym.size() == RANS_PRECISION);
std::vector<int> ret;
@@ -97,10 +100,11 @@
return ret;
}
-void rans_build_dec_tab(const struct rans_sym sym_tab[], rans_lut dec_tab) {
- dec_tab[0] = 0;
- for (int i = 1; dec_tab[i - 1] < RANS_PRECISION; ++i) {
- dec_tab[i] = dec_tab[i - 1] + sym_tab[i - 1].prob;
+void rans_build_dec_tab(const struct rans_sym sym_tab[],
+ aom_cdf_prob *dec_tab) {
+ unsigned int sum = 0;
+ for (int i = 0; sum < RANS_PRECISION; ++i) {
+ dec_tab[i] = sum += sym_tab[i].prob;
}
}
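+// With the spareto65 probabilities above, dec_tab holds the inclusive prefix
+// sums { 260, 448, 586, 688, 821, 943, 1007, 1022, 1023, 1024 }.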
@@ -108,7 +112,7 @@
uint8_t *buf) {
AnsCoder a;
ans_write_init(&a, buf);
- rans_lut dec_tab;
+ aom_cdf_prob dec_tab[kRansSymbols];
rans_build_dec_tab(tab, dec_tab);
std::clock_t start = std::clock();
@@ -149,16 +153,20 @@
class AnsTest : public ::testing::Test {
protected:
static void SetUpTestCase() {
- sym_vec_ = ans_encode_build_vals(rans_sym_tab, kNumSyms);
+ sym_vec_ = ans_encode_build_vals(rans_sym_tab_, kNumSyms);
}
virtual void SetUp() { buf_ = new uint8_t[kNumSyms / 2]; }
virtual void TearDown() { delete[] buf_; }
static const int kNumSyms = 25000000;
static std::vector<int> sym_vec_;
+ static rans_sym rans_sym_tab_[kRansSymbols];
uint8_t *buf_;
};
std::vector<int> AnsTest::sym_vec_;
+rans_sym AnsTest::rans_sym_tab_[kRansSymbols];
TEST_F(AbsTest, Uabs) { EXPECT_TRUE(check_uabs(pv_vec_, buf_)); }
-TEST_F(AnsTest, Rans) { EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab, buf_)); }
+TEST_F(AnsTest, Rans) {
+ EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab_, buf_));
+}
} // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 9a661f9..e4179ef 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -402,6 +402,12 @@
AOM_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_1_avx2,
+ AOM_BITS_8)));
+#endif // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
new file mode 100644
index 0000000..a949ebf
--- /dev/null
+++ b/test/fht32x32_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x32Param;
+
+void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht32x32_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd);
+
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt32x32Param;
+
+void highbd_fht32x32_ref(const int16_t *in, int32_t *out, int stride,
+ int tx_type, int bd) {
+ av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#if HAVE_AVX2
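+// Only the forward transform is exercised by these tests (RunCoeffCheck), so
+// the inverse-transform slot of the parameter tuple is filled with a no-op.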
+void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type) {
+ (void)in;
+ (void)out;
+ (void)stride;
+ (void)tx_type;
+}
+#endif
+
+class AV1Trans32x32HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht32x32Param> {
+ public:
+ virtual ~AV1Trans32x32HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 32;
+ fwd_txfm_ref = fht32x32_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans32x32HT
+ : public ::testing::TestWithParam<HighbdHt32x32Param> {
+ public:
+ virtual ~AV1HighbdTrans32x32HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = highbd_fht32x32_ref;
+ tx_type_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = 1024;
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(32, sizeof(int16_t) * num_coeffs_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+ output_ref_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ HbdHtFunc fwd_txfm_;
+ HbdHtFunc fwd_txfm_ref_;
+ int tx_type_;
+ int bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ int16_t *input_;
+ int32_t *output_;
+ int32_t *output_ref_;
+};
+
+void AV1HighbdTrans32x32HT::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i, j;
+ const int stride = 32;
+ const int num_tests = 1000;
+
+ for (i = 0; i < num_tests; ++i) {
+ for (j = 0; j < num_coeffs_; ++j) {
+ input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ }
+
+ fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
+
+ for (j = 0; j < num_coeffs_; ++j) {
+ EXPECT_EQ(output_ref_[j], output_[j])
+ << "Not bit-exact result at index: " << j << " at test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdTrans32x32HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_AVX2
+const Ht32x32Param kArrayHt32x32Param_avx2[] = {
+ // TODO(luoyi): The DCT_DCT tx_type is not enabled in av1_fht32x32_avx2 yet.
+ // make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+#endif // CONFIG_EXT_TX
+};
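+// tx_type 0 (DCT_DCT, see the TODO above) and 9 (IDTX) are omitted because
+// av1_fht32x32_avx2 does not implement them.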
+INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
+ ::testing::ValuesIn(kArrayHt32x32Param_avx2));
+#endif // HAVE_AVX2
+} // namespace
diff --git a/test/test.mk b/test/test.mk
index c0573e7..f149fc5 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -144,6 +144,7 @@
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht8x16_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht16x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fht32x32_test.cc
endif
LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE) += av1_ext_tile_test.cc
diff --git a/tools/gen_authors.sh b/tools/gen_authors.sh
index 4cfd81e..5def8bc 100755
--- a/tools/gen_authors.sh
+++ b/tools/gen_authors.sh
@@ -6,8 +6,5 @@
# This file is automatically generated from the git commit history
# by tools/gen_authors.sh.
-$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
-Google Inc.
-The Mozilla Foundation
-The Xiph.Org Foundation
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v "corp.google\|clang-format")
EOF
diff --git a/tools/gen_constrained_tokenset.py b/tools/gen_constrained_tokenset.py
new file mode 100755
index 0000000..a0f8280
--- /dev/null
+++ b/tools/gen_constrained_tokenset.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, alpha is
+first solved for, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+"""
+
+import heapq
+import sys
+import numpy as np
+import scipy.optimize
+import scipy.stats
+
+
+def cdf_spareto(x, xm, beta):
+ p = 1 - (xm / (np.abs(x) + xm))**beta
+ p = 0.5 + 0.5 * np.sign(x) * p
+ return p
+
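+# For example (illustrative), the node probabilities computed below are
+# differences of this cdf at half-integer boundaries:
+# P(node 1) = 2 * (cdf(1.5) - cdf(0.5)).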
+
+def get_spareto(p, beta):
+ cdf = cdf_spareto
+
+ def func(x):
+ return ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) /
+ (1 - cdf(0.5, x, beta)) - p)**2
+
+ alpha = scipy.optimize.fminbound(func, 1e-12, 10000, xtol=1e-12)
+ parray = np.zeros(11)
+ parray[0] = 2 * (cdf(0.5, alpha, beta) - 0.5)
+ parray[1] = (2 * (cdf(1.5, alpha, beta) - cdf(0.5, alpha, beta)))
+ parray[2] = (2 * (cdf(2.5, alpha, beta) - cdf(1.5, alpha, beta)))
+ parray[3] = (2 * (cdf(3.5, alpha, beta) - cdf(2.5, alpha, beta)))
+ parray[4] = (2 * (cdf(4.5, alpha, beta) - cdf(3.5, alpha, beta)))
+ parray[5] = (2 * (cdf(6.5, alpha, beta) - cdf(4.5, alpha, beta)))
+ parray[6] = (2 * (cdf(10.5, alpha, beta) - cdf(6.5, alpha, beta)))
+ parray[7] = (2 * (cdf(18.5, alpha, beta) - cdf(10.5, alpha, beta)))
+ parray[8] = (2 * (cdf(34.5, alpha, beta) - cdf(18.5, alpha, beta)))
+ parray[9] = (2 * (cdf(66.5, alpha, beta) - cdf(34.5, alpha, beta)))
+ parray[10] = 2 * (1. - cdf(66.5, alpha, beta))
+ return parray
+
+
+def quantize_probs(p, save_first_bin, bits):
+ """Quantize probability precisely.
+
+ Quantize probabilities minimizing dH (Kullback-Leibler divergence)
+ approximated by: sum (p_i-q_i)^2/p_i.
+ References:
+ https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+ https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit
+ """
+ num_sym = p.size
+ p = np.clip(p, 1e-16, 1)
+ L = 2**bits
+ pL = p * L
+ ip = 1. / p # inverse probability
+ q = np.clip(np.round(pL), 1, L + 1 - num_sym)
+ quant_err = (pL - q)**2 * ip
+ sgn = np.sign(L - q.sum()) # direction of correction
+ if sgn != 0: # correction is needed
+ v = [] # heap of adjustment results (adjustment err, index) of each symbol
+ for i in range(1 if save_first_bin else 0, num_sym):
+ q_adj = q[i] + sgn
+ if q_adj > 0 and q_adj < L:
+ adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+ heapq.heappush(v, (adj_err, i))
+ while q.sum() != L:
+ # apply lowest error adjustment
+ (adj_err, i) = heapq.heappop(v)
+ quant_err[i] += adj_err
+ q[i] += sgn
+ # calculate the cost of adjusting this symbol again
+ q_adj = q[i] + sgn
+ if q_adj > 0 and q_adj < L:
+ adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+ heapq.heappush(v, (adj_err, i))
+ return q
+
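+# Example (illustrative): quantize_probs(np.array([.5, .3, .2]), False, 8)
+# yields [128, 77, 51]; np.round() already sums to 2**8 here, so no
+# correction step is needed.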
+
+def get_quantized_spareto(p, beta, bits):
+ parray = get_spareto(p, beta)
+ parray = parray[1:] / (1 - parray[0])
+ qarray = quantize_probs(parray, True, bits)
+ return qarray.astype(np.int)
+
+
+def main(bits=8):
+ beta = 8
+ for q in range(1, 256):
+ parray = get_quantized_spareto(q / 256., beta, bits)
+ assert parray.sum() == 2**bits
+ print '{', ', '.join('%d' % i for i in parray), '},'
+
+
+if __name__ == '__main__':
+ if len(sys.argv) > 1:
+ main(int(sys.argv[1]))
+ else:
+ main()