| /* |
| * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include <assert.h> |
| #include <math.h> |
| #include <stdbool.h> |
| |
| #include "config/aom_dsp_rtcd.h" |
| #include "config/av1_rtcd.h" |
| |
| #include "aom_dsp/aom_dsp_common.h" |
| #include "aom_dsp/blend.h" |
| #include "aom_mem/aom_mem.h" |
| #include "aom_ports/aom_timer.h" |
| #include "aom_ports/mem.h" |
| #include "aom_ports/system_state.h" |
| |
| #include "av1/common/cfl.h" |
| #include "av1/common/common.h" |
| #include "av1/common/common_data.h" |
| #include "av1/common/entropy.h" |
| #include "av1/common/entropymode.h" |
| #include "av1/common/idct.h" |
| #include "av1/common/mvref_common.h" |
| #include "av1/common/obmc.h" |
| #include "av1/common/onyxc_int.h" |
| #include "av1/common/pred_common.h" |
| #include "av1/common/quant_common.h" |
| #include "av1/common/reconinter.h" |
| #include "av1/common/reconintra.h" |
| #include "av1/common/scan.h" |
| #include "av1/common/seg_common.h" |
| #include "av1/common/txb_common.h" |
| #include "av1/common/warped_motion.h" |
| |
| #include "av1/encoder/aq_variance.h" |
| #include "av1/encoder/av1_quantize.h" |
| #include "av1/encoder/cost.h" |
| #include "av1/encoder/encodemb.h" |
| #include "av1/encoder/encodemv.h" |
| #include "av1/encoder/encoder.h" |
| #include "av1/encoder/encodetxb.h" |
| #include "av1/encoder/hybrid_fwd_txfm.h" |
| #include "av1/encoder/mcomp.h" |
| #include "av1/encoder/ml.h" |
| #include "av1/encoder/palette.h" |
| #include "av1/encoder/pustats.h" |
| #include "av1/encoder/random.h" |
| #include "av1/encoder/ratectrl.h" |
| #include "av1/encoder/rd.h" |
| #include "av1/encoder/rdopt.h" |
| #include "av1/encoder/reconinter_enc.h" |
| #include "av1/encoder/tokenize.h" |
| #include "av1/encoder/tx_prune_model_weights.h" |
| |
// Set this macro as 1 to collect data about tx size selection.
#define COLLECT_TX_SIZE_DATA 0

#if COLLECT_TX_SIZE_DATA
static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
#endif

// Signature of the functions that estimate rate/distortion over the planes
// [plane_from, plane_to] of a superblock (dispatched via model_rd_sb_fn).
typedef void (*model_rd_for_sb_type)(
    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
// Signature of the functions that estimate rate/distortion for a single plane
// from its sse and sample count (dispatched via model_rd_sse_fn).
typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
                                       const MACROBLOCK *const x,
                                       BLOCK_SIZE plane_bsize, int plane,
                                       int64_t sse, int num_samples, int *rate,
                                       int64_t *dist);
| |
// Forward declarations of the rate/distortion model implementations that
// populate the model_rd_sb_fn / model_rd_sse_fn dispatch tables below.
static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
                            int plane_to, int mi_row, int mi_col,
                            int *out_rate_sum, int64_t *out_dist_sum,
                            int *skip_txfm_sb, int64_t *skip_sse_sb,
                            int *plane_rate, int64_t *plane_sse,
                            int64_t *plane_dist);
static void model_rd_for_sb_with_curvfit(
    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
static void model_rd_for_sb_with_surffit(
    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
static void model_rd_for_sb_with_dnn(
    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
static void model_rd_for_sb_with_fullrdy(
    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
static void model_rd_from_sse(const AV1_COMP *const cpi,
                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
                              int plane, int64_t sse, int num_samples,
                              int *rate, int64_t *dist);
static void model_rd_with_dnn(const AV1_COMP *const cpi,
                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
                              int plane, int64_t sse, int num_samples,
                              int *rate, int64_t *dist);
static void model_rd_with_curvfit(const AV1_COMP *const cpi,
                                  const MACROBLOCK *const x,
                                  BLOCK_SIZE plane_bsize, int plane,
                                  int64_t sse, int num_samples, int *rate,
                                  int64_t *dist);
static void model_rd_with_surffit(const AV1_COMP *const cpi,
                                  const MACROBLOCK *const x,
                                  BLOCK_SIZE plane_bsize, int plane,
                                  int64_t sse, int num_samples, int *rate,
                                  int64_t *dist);
| |
// The available rate/distortion model flavors; used to index the dispatch
// tables below and selected per use-case via the MODELRD_TYPE_* macros.
enum {
  MODELRD_LEGACY,
  MODELRD_CURVFIT,
  MODELRD_SUFFIT,
  MODELRD_DNN,
  MODELRD_FULLRDY,
  MODELRD_TYPES
} UENUM1BYTE(ModelRdType);

// Superblock-level model implementations, indexed by ModelRdType.
// Order must match the enum above.
static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
  model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
  model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
};

// Per-plane from-sse model implementations, indexed by ModelRdType.
// MODELRD_FULLRDY has no from-sse variant, hence the NULL entry.
static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
  model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
  model_rd_with_dnn, NULL
};

// Which ModelRdType each encoder decision uses:
// 0: Legacy model
// 1: Curve fit model
// 2: Surface fit model
// 3: DNN regression model
// 4: Full rd model
#define MODELRD_TYPE_INTERP_FILTER 1
#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
#define MODELRD_TYPE_MASKED_COMPOUND 1
#define MODELRD_TYPE_INTERINTRA 1
#define MODELRD_TYPE_INTRA 1
#define MODELRD_TYPE_DIST_WTD_COMPOUND 1
#define MODELRD_TYPE_MOTION_MODE_RD 1
| |
// All combinations of the two switchable interpolation filters of a dual
// filter pair packed into single InterpFilters words. The low 16 bits hold
// the value labelled "y" below; the high 16 bits cycle 0..2 across each row.
#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
  0x00000000, 0x00010000, 0x00020000,  // y = 0
  0x00000001, 0x00010001, 0x00020001,  // y = 1
  0x00000002, 0x00010002, 0x00020002,  // y = 2
};

// Linear SVM parameters used when deciding ADST flipping; the first four
// entries apply to the vertical direction, the last four to the horizontal.
static const double ADST_FLIP_SVM[8] = {
  /* vertical */
  -6.6623, -2.8062, -3.2531, 3.1671,
  /* horizontal */
  -7.7051, -3.2234, -3.6193, 3.4533
};
| |
// A prediction mode paired with its (up to two) reference frames; the unit of
// the av1_mode_order search table below.
typedef struct {
  PREDICTION_MODE mode;
  MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;

// Bit flags controlling shortcuts taken during the fast transform search.
enum {
  FTXS_NONE = 0,
  FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
  FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
  FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
} UENUM1BYTE(FAST_TX_SEARCH_MODE);

// Shared state threaded through the per-transform-block rd cost computation.
struct rdcost_block_args {
  const AV1_COMP *cpi;
  MACROBLOCK *x;
  ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];  // above entropy contexts
  ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];   // left entropy contexts
  RD_STATS rd_stats;
  int64_t this_rd;
  int64_t best_rd;          // best rd found so far; used for early exit
  int exit_early;           // set when this_rd exceeds best_rd
  int incomplete_exit;
  int use_fast_coef_costing;
  FAST_TX_SEARCH_MODE ftxs_mode;
  int skip_trellis;
};
| |
// Index of the last single-reference NEWMV entry in av1_mode_order below.
#define LAST_NEW_MV_INDEX 6
// The order in which prediction modes (with their reference-frame pairs) are
// evaluated during the inter/intra mode search: single-reference inter modes
// first, then compound inter modes, then intra modes.
static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },

  { NEWMV, { LAST_FRAME, NONE_FRAME } },
  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },

  { NEARMV, { LAST_FRAME, NONE_FRAME } },
  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },

  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },

  // TODO(zoeliu): May need to reconsider the order on the modes to check

  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },

  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },

  { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },

  { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },

  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
  { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
  { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
  { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
  { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
  { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
  { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
  { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },

  { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
  { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
  { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },

  { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
  { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },

  // intra modes
  { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
  { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
  { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
  { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
  { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
  { H_PRED, { INTRA_FRAME, NONE_FRAME } },
  { V_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
  { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
};
| |
// Maps each intra PREDICTION_MODE to its THR_* mode-threshold index.
static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
  THR_DC,         // DC_PRED,
  THR_V_PRED,     // V_PRED,
  THR_H_PRED,     // H_PRED,
  THR_D45_PRED,   // D45_PRED,
  THR_D135_PRED,  // D135_PRED,
  THR_D113_PRED,  // D113_PRED,
  THR_D157_PRED,  // D157_PRED,
  THR_D203_PRED,  // D203_PRED,
  THR_D67_PRED,   // D67_PRED,
  THR_SMOOTH,     // SMOOTH_PRED,
  THR_SMOOTH_V,   // SMOOTH_V_PRED,
  THR_SMOOTH_H,   // SMOOTH_H_PRED,
  THR_PAETH,      // PAETH_PRED,
};
| |
// Maps [single-reference inter mode][reference frame] to its THR_* index;
// -1 marks an invalid combination (index 0 is INTRA_FRAME/NONE_FRAME).
/* clang-format off */
static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
                                             [REF_FRAMES] = {
  // NEARESTMV,
  { -1, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3,
    THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, },
  // NEARMV,
  { -1, THR_NEARMV, THR_NEARL2, THR_NEARL3,
    THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, },
  // GLOBALMV,
  { -1, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3,
    THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, },
  // NEWMV,
  { -1, THR_NEWMV, THR_NEWL2, THR_NEWL3,
    THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, },
};
/* clang-format on */
| |
// Maps [compound inter mode][first ref][second ref] to its THR_* index;
// -1 marks reference pairs that are not valid compound combinations.
/* clang-format off */
static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
                                           [REF_FRAMES] = {
  // NEAREST_NEARESTMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3,
      THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB,
      THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEARESTL2B,
      THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEARESTL3B,
      THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEARESTGB,
      THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEARESTBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // NEAR_NEARMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3,
      THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB,
      THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEARL2B,
      THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEARL3B,
      THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEARGB,
      THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEARBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // NEAREST_NEWMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3,
      THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB,
      THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEWL2B,
      THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEWL3B,
      THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEWGB,
      THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEAREST_NEWBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // NEW_NEARESTMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3,
      THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB,
      THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARESTL2B,
      THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARESTL3B,
      THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARESTGB,
      THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARESTBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // NEAR_NEWMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3,
      THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB,
      THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEWL2B,
      THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEWL3B,
      THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEWGB,
      THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEAR_NEWBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // NEW_NEARMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3,
      THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB,
      THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARL2B,
      THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARL3B,
      THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARGB,
      THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEARBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // GLOBAL_GLOBALMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3,
      THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB,
      THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_GLOBAL_GLOBALL2B,
      THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_GLOBAL_GLOBALL3B,
      THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_GLOBAL_GLOBALGB,
      THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_GLOBAL_GLOBALBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
  // NEW_NEWMV,
  {
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1,
      THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3,
      THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB,
      THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEWL2B,
      THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEWL3B,
      THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, },
    { -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEWGB,
      THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, },
    { -1, -1,
      -1, -1,
      -1, -1,
      -1, THR_COMP_NEW_NEWBA, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
    { -1, -1, -1, -1, -1, -1, -1, -1, },
  },
};
/* clang-format on */
| |
| static int get_prediction_mode_idx(PREDICTION_MODE this_mode, |
| MV_REFERENCE_FRAME ref_frame, |
| MV_REFERENCE_FRAME second_ref_frame) { |
| if (this_mode < INTRA_MODE_END) { |
| assert(ref_frame == INTRA_FRAME); |
| assert(second_ref_frame == NONE_FRAME); |
| return intra_to_mode_idx[this_mode - INTRA_MODE_START]; |
| } |
| if (this_mode >= SINGLE_INTER_MODE_START && |
| this_mode < SINGLE_INTER_MODE_END) { |
| assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); |
| return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] |
| [ref_frame]; |
| } |
| if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) { |
| assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); |
| assert((second_ref_frame > INTRA_FRAME) && |
| (second_ref_frame <= ALTREF_FRAME)); |
| return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] |
| [second_ref_frame]; |
| } |
| assert(0); |
| return -1; |
| } |
| |
// Order in which luma intra modes are tried during the intra rd search.
static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
  SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
  D67_PRED,      D113_PRED,     D45_PRED,
};

// Order in which chroma intra modes are tried during the intra rd search.
static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
  UV_DC_PRED,     UV_CFL_PRED,   UV_H_PRED,        UV_V_PRED,
  UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
  UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
  UV_D113_PRED,   UV_D45_PRED,
};
| |
// The rd result of one single-reference inter-mode evaluation.
typedef struct SingleInterModeState {
  int64_t rd;                    // rd cost of this candidate
  MV_REFERENCE_FRAME ref_frame;  // reference frame the rd was measured with
  int valid;                     // nonzero once this entry has been filled
} SingleInterModeState;

// Accumulated state of the inter-mode search: the best candidate found so
// far, cached per-reference statistics, and saved NEWMV search results so
// they can be reused across ref-mv indices.
typedef struct InterModeSearchState {
  int64_t best_rd;
  MB_MODE_INFO best_mbmode;
  int best_rate_y;
  int best_rate_uv;
  int best_mode_skippable;
  int best_skip2;
  int best_mode_index;
  int skip_intra_modes;
  int num_available_refs;
  int64_t dist_refs[REF_FRAMES];
  int dist_order_refs[REF_FRAMES];
  int64_t mode_threshold[MAX_MODES];
  PREDICTION_MODE best_intra_mode;
  int64_t best_intra_rd;
  int angle_stats_ready;
  uint8_t directional_mode_skip_mask[INTRA_MODES];
  unsigned int best_pred_sse;
  // Cached chroma intra results, indexed by tx size.
  int rate_uv_intra[TX_SIZES_ALL];
  int rate_uv_tokenonly[TX_SIZES_ALL];
  int64_t dist_uvs[TX_SIZES_ALL];
  int skip_uvs[TX_SIZES_ALL];
  UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
  PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
  int8_t uv_angle_delta[TX_SIZES_ALL];
  int64_t best_pred_rd[REFERENCE_MODES];
  int64_t best_pred_diff[REFERENCE_MODES];
  // Save a set of single_newmv for each checked ref_mv.
  int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
  int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
  int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
  int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
  // The rd of simple translation in single inter modes
  int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];

  // Single search results by [directions][modes][reference frames]
  SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
  int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
  SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
                                            [FWD_REFS];
  int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];

  MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
} InterModeSearchState;
| |
| static int inter_mode_data_block_idx(BLOCK_SIZE bsize) { |
| if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || |
| bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { |
| return -1; |
| } |
| return 1; |
| } |
| |
| void av1_inter_mode_data_init(TileDataEnc *tile_data) { |
| for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { |
| InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; |
| md->ready = 0; |
| md->num = 0; |
| md->dist_sum = 0; |
| md->ld_sum = 0; |
| md->sse_sum = 0; |
| md->sse_sse_sum = 0; |
| md->sse_ld_sum = 0; |
| } |
| } |
| |
// Estimates the residue coding cost and distortion of a block from its sse
// using the tile's fitted linear rd model (est_ld = a * sse + b; see
// av1_inter_mode_data_fit()). Returns 1 if the model for this block size is
// ready, 0 otherwise (in which case the outputs are left untouched).
static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
                             int64_t sse, int *est_residue_cost,
                             int64_t *est_dist) {
  aom_clear_system_state();
  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
  if (md->ready) {
    // Below the mean distortion, predict zero residue cost and take the sse
    // itself as the distortion estimate.
    if (sse < md->dist_mean) {
      *est_residue_cost = 0;
      *est_dist = sse;
    } else {
      *est_dist = (int64_t)round(md->dist_mean);
      const double est_ld = md->a * sse + md->b;
      // Clamp estimated rate cost by INT_MAX / 2.
      // TODO(angiebird@google.com): find better solution than clamping.
      if (fabs(est_ld) < 1e-2) {
        // Near-zero slope would blow up the division below.
        *est_residue_cost = INT_MAX / 2;
      } else {
        double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
        if (est_residue_cost_dbl < 0) {
          *est_residue_cost = 0;
        } else {
          *est_residue_cost =
              (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
        }
      }
      // A non-positive rate estimate degenerates to the zero-rate case.
      if (*est_residue_cost <= 0) {
        *est_residue_cost = 0;
        *est_dist = sse;
      }
    }
    return 1;
  }
  return 0;
}
| |
// Fits (or refreshes) the per-block-size linear rd models ld = a * sse + b
// from the statistics accumulated by inter_mode_data_push(). A model is
// (re)fitted only once enough samples are available: 200 for the initial fit,
// 64 for subsequent refreshes, which blend into the previous means with an
// exponential moving average. The accumulators are cleared after each fit.
void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
  aom_clear_system_state();
  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
    const int block_idx = inter_mode_data_block_idx(bsize);
    InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
    if (block_idx == -1) continue;
    if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
      continue;
    } else {
      if (md->ready == 0) {
        // First fit: plain sample means.
        md->dist_mean = md->dist_sum / md->num;
        md->ld_mean = md->ld_sum / md->num;
        md->sse_mean = md->sse_sum / md->num;
        md->sse_sse_mean = md->sse_sse_sum / md->num;
        md->sse_ld_mean = md->sse_ld_sum / md->num;
      } else {
        // Refresh: blend new sample means into the old with weight 3:1.
        const double factor = 3;
        md->dist_mean =
            (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
        md->ld_mean =
            (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
        md->sse_mean =
            (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
        md->sse_sse_mean =
            (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
            (factor + 1);
        md->sse_ld_mean =
            (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
            (factor + 1);
      }

      // Least-squares slope/intercept from E[x], E[y], E[x^2], E[xy]
      // with x = sse and y = ld.
      const double my = md->ld_mean;
      const double mx = md->sse_mean;
      const double dx = sqrt(md->sse_sse_mean);
      const double dxy = md->sse_ld_mean;

      // NOTE(review): no guard against a zero variance denominator here
      // (dx * dx == mx * mx when all sse samples are equal) — presumably
      // never happens with >= 64 real samples; confirm.
      md->a = (dxy - mx * my) / (dx * dx - mx * mx);
      md->b = my - md->a * mx;
      md->ready = 1;

      // Restart accumulation for the next (refresh) fit.
      md->num = 0;
      md->dist_sum = 0;
      md->ld_sum = 0;
      md->sse_sum = 0;
      md->sse_sse_sum = 0;
      md->sse_ld_sum = 0;
    }
    (void)rdmult;
  }
}
| |
| static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize, |
| int64_t sse, int64_t dist, int residue_cost) { |
| if (residue_cost == 0 || sse == dist) return; |
| const int block_idx = inter_mode_data_block_idx(bsize); |
| if (block_idx == -1) return; |
| InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; |
| if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { |
| aom_clear_system_state(); |
| const double ld = (sse - dist) * 1. / residue_cost; |
| ++rd_model->num; |
| rd_model->dist_sum += dist; |
| rd_model->ld_sum += ld; |
| rd_model->sse_sum += sse; |
| rd_model->sse_sse_sum += (double)sse * (double)sse; |
| rd_model->sse_ld_sum += sse * ld; |
| } |
| } |
| |
| static void inter_modes_info_push(InterModesInfo *inter_modes_info, |
| int mode_rate, int64_t sse, int64_t rd, |
| bool true_rd, uint8_t *blk_skip, |
| RD_STATS *rd_cost, RD_STATS *rd_cost_y, |
| RD_STATS *rd_cost_uv, |
| const MB_MODE_INFO *mbmi) { |
| const int num = inter_modes_info->num; |
| assert(num < MAX_INTER_MODES); |
| inter_modes_info->mbmi_arr[num] = *mbmi; |
| inter_modes_info->mode_rate_arr[num] = mode_rate; |
| inter_modes_info->sse_arr[num] = sse; |
| inter_modes_info->est_rd_arr[num] = rd; |
| inter_modes_info->true_rd_arr[num] = true_rd; |
| if (blk_skip != NULL) { |
| memcpy(inter_modes_info->blk_skip_arr[num], blk_skip, |
| sizeof(blk_skip[0]) * MAX_MIB_SIZE * MAX_MIB_SIZE); |
| } |
| inter_modes_info->rd_cost_arr[num] = *rd_cost; |
| inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; |
| inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; |
| ++inter_modes_info->num; |
| } |
| |
| static int compare_rd_idx_pair(const void *a, const void *b) { |
| if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { |
| return 0; |
| } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { |
| return 1; |
| } else { |
| return -1; |
| } |
| } |
| |
| static void inter_modes_info_sort(const InterModesInfo *inter_modes_info, |
| RdIdxPair *rd_idx_pair_arr) { |
| if (inter_modes_info->num == 0) { |
| return; |
| } |
| for (int i = 0; i < inter_modes_info->num; ++i) { |
| rd_idx_pair_arr[i].idx = i; |
| rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; |
| } |
| qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), |
| compare_rd_idx_pair); |
| } |
| |
| static INLINE int write_uniform_cost(int n, int v) { |
| const int l = get_unsigned_bits(n); |
| const int m = (1 << l) - n; |
| if (l == 0) return 0; |
| if (v < m) |
| return av1_cost_literal(l - 1); |
| else |
| return av1_cost_literal(l); |
| } |
| |
| // Similar to store_cfl_required(), but for use during the RDO process, |
| // where we haven't yet determined whether this block uses CfL. |
| static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, |
| const MACROBLOCK *x) { |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED; |
| |
| if (!xd->cfl.is_chroma_reference) { |
| // For non-chroma-reference blocks, we should always store the luma pixels, |
| // in case the corresponding chroma-reference block uses CfL. |
| // Note that this can only happen for block sizes which are <8 on |
| // their shortest side, as otherwise they would be chroma reference |
| // blocks. |
| return CFL_ALLOWED; |
| } |
| |
| // For chroma reference blocks, we should store data in the encoder iff we're |
| // allowed to try out CfL. |
| return is_cfl_allowed(xd); |
| } |
| |
// constants for prune 1 and prune 2 decision boundaries
// EDST_* bound the SVM projections used in adst_vs_flipadst(); CORR_* bound
// the pixel correlations used in dct_vs_idtx(). A score beyond MID +/- MARGIN
// prunes one of the two competing 1D transform types.
#define FAST_EXT_TX_CORR_MID 0.0
#define FAST_EXT_TX_EDST_MID 0.1
#define FAST_EXT_TX_CORR_MARGIN 0.5
#define FAST_EXT_TX_EDST_MARGIN 0.3
| |
| static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, |
| RD_STATS *rd_stats, BLOCK_SIZE bsize, |
| int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode); |
| |
| static unsigned pixel_dist_visible_only( |
| const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, |
| const int src_stride, const uint8_t *dst, const int dst_stride, |
| const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, |
| int visible_cols) { |
| unsigned sse; |
| |
| if (txb_rows == visible_rows && txb_cols == visible_cols) { |
| cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); |
| return sse; |
| } |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| if (is_cur_buf_hbd(xd)) { |
| uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, |
| visible_cols, visible_rows); |
| return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); |
| } |
| sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, |
| visible_rows); |
| return sse; |
| } |
| |
| #if CONFIG_DIST_8X8 |
| static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, |
| int sstride, int coeff_shift) { |
| uint64_t svar = 0; |
| uint64_t dvar = 0; |
| uint64_t sum_s = 0; |
| uint64_t sum_d = 0; |
| uint64_t sum_s2 = 0; |
| uint64_t sum_d2 = 0; |
| uint64_t sum_sd = 0; |
| uint64_t dist = 0; |
| |
| int i, j; |
| for (i = 0; i < 8; i++) { |
| for (j = 0; j < 8; j++) { |
| sum_s += src[i * sstride + j]; |
| sum_d += dst[i * dstride + j]; |
| sum_s2 += src[i * sstride + j] * src[i * sstride + j]; |
| sum_d2 += dst[i * dstride + j] * dst[i * dstride + j]; |
| sum_sd += src[i * sstride + j] * dst[i * dstride + j]; |
| } |
| } |
| /* Compute the variance -- the calculation cannot go negative. */ |
| svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); |
| dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); |
| |
| // Tuning of jm's original dering distortion metric used in CDEF tool, |
| // suggested by jm |
| const uint64_t a = 4; |
| const uint64_t b = 2; |
| const uint64_t c1 = (400 * a << 2 * coeff_shift); |
| const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift); |
| |
| dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * |
| (svar + dvar + c1) / |
| (sqrt(svar * (double)dvar + c2))); |
| |
| // Calibrate dist to have similar rate for the same QP with MSE only |
| // distortion (as in master branch) |
| dist = (uint64_t)((float)dist * 0.75); |
| |
| return dist; |
| } |
| |
// Returns the variance (sum of squared deviations, scaled down by >> 4
// twice) of a 4x4 block of 16-bit pixels.
// Fix: with 16-bit pixels, sum can reach 2^20 and sum * sum about 2^40,
// and t * t up to ~2^32, both overflowing 32-bit 'int' arithmetic (UB);
// the accumulation is now done in 64 bits and narrowed only on return.
static int od_compute_var_4x4(uint16_t *x, int stride) {
  int64_t sum = 0;
  int64_t s2 = 0;
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      const int t = x[i * stride + j];
      sum += t;
      s2 += (int64_t)t * t;
    }
  }

  return (int)((s2 - (sum * sum >> 4)) >> 4);
}
| |
/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
   the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
   is applied both horizontally and vertically. For X=5, the filter is
   a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
/* OD_DIST_LP_NORM is the filter's DC gain; od_compute_dist_8x8() divides by
   its square (per direction) to renormalize the filtered error energy. */
#define OD_DIST_LP_MID (5)
#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
| |
/* Perceptual (Daala-style) distortion of one 8x8 block: the low-pass
   filtered error energy in e_lp plus a 4x4 variance-mismatch term, scaled
   by an activity factor derived from the source variance. x and y are the
   8x8 source and reconstruction in pixel domain; e_lp is the filtered
   error, all with the same stride. */
static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
                                  uint16_t *y, od_coeff *e_lp, int stride) {
  double sum;
  int min_var;
  double mean_var;
  double var_stat;
  double activity;
  double calibration;
  int i;
  int j;
  double vardist;

  vardist = 0;

#if 1
  min_var = INT_MAX;
  mean_var = 0;
  /* Compare 4x4 variances of source and reconstruction on a 3x3 grid of
     half-overlapping 4x4 sub-blocks (offsets 0, 2, 4). */
  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++) {
      int varx;
      int vary;
      varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
      vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
      min_var = OD_MINI(min_var, varx);
      mean_var += 1. / (1 + varx);
      /* The cast to (double) is to avoid an overflow before the sqrt.*/
      vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
    }
  }
  /* We use a different variance statistic depending on whether activity
     masking is used, since the harmonic mean appeared slightly worse with
     masking off. The calibration constant just ensures that we preserve the
     rate compared to activity=1. */
  if (use_activity_masking) {
    calibration = 1.95;
    var_stat = 9. / mean_var;
  } else {
    calibration = 1.62;
    var_stat = min_var;
  }
  /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
     activity masking constant. */
  activity = calibration * pow(.25 + var_stat, -1. / 6);
#else
  activity = 1;
#endif  // 1
  /* Sum the energy of the low-pass filtered error. */
  sum = 0;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
  }
  /* Normalize the filter to unit DC response. */
  sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
               OD_DIST_LP_NORM);
  return activity * activity * (sum + vardist);
}
| |
// Note : Inputs x and y are in a pixel domain
// Shared tail of od_compute_dist() / od_compute_dist_diff(): applies the
// vertical [1 OD_DIST_LP_MID 1] filter to the horizontally pre-filtered
// error in 'tmp' (writing into e_lp), sums od_compute_dist_8x8() over the
// 8x8 tiles of the block, and applies a qindex-dependent calibration fitted
// by linear regression against SSE.
static double od_compute_dist_common(int activity_masking, uint16_t *x,
                                     uint16_t *y, int bsize_w, int bsize_h,
                                     int qindex, od_coeff *tmp,
                                     od_coeff *e_lp) {
  int i, j;
  double sum = 0;
  const int mid = OD_DIST_LP_MID;

  // Vertical filter: first and last rows use the replicated-edge form
  // (weight 2 on the single neighbor), interior rows use [1 mid 1].
  for (j = 0; j < bsize_w; j++) {
    e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
    e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] +
                                        2 * tmp[(bsize_h - 2) * bsize_w + j];
  }
  for (i = 1; i < bsize_h - 1; i++) {
    for (j = 0; j < bsize_w; j++) {
      e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
                              tmp[(i - 1) * bsize_w + j] +
                              tmp[(i + 1) * bsize_w + j];
    }
  }
  // Accumulate the per-8x8 perceptual distortion over the whole block.
  for (i = 0; i < bsize_h; i += 8) {
    for (j = 0; j < bsize_w; j += 8) {
      sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j],
                                 &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
                                 bsize_w);
    }
  }
  /* Scale according to linear regression against SSE, for 8x8 blocks. */
  if (activity_masking) {
    sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
           (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
  } else {
    sum *= qindex >= 128
               ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
               : qindex <= 43 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
                              : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
  }

  return sum;
}
| |
// Daala perceptual distortion between source x and reconstruction y (both
// 16-bit pixels, densely packed with stride bsize_w). Computes the error,
// pre-filters it horizontally with [1 OD_DIST_LP_MID 1], and defers the
// vertical filter plus per-8x8 accumulation to od_compute_dist_common().
static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
                              int bsize_h, int qindex) {
  assert(bsize_w >= 8 && bsize_h >= 8);

  // Activity masking is disabled in this build path.
  int activity_masking = 0;

  int i, j;
  DECLARE_ALIGNED(16, od_coeff, e[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
  // e = x - y: the raw reconstruction error.
  for (i = 0; i < bsize_h; i++) {
    for (j = 0; j < bsize_w; j++) {
      e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
    }
  }
  // Horizontal filter: replicated-edge form at the row ends, [1 mid 1]
  // in the interior.
  int mid = OD_DIST_LP_MID;
  for (i = 0; i < bsize_h; i++) {
    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
    tmp[i * bsize_w + bsize_w - 1] =
        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
    for (j = 1; j < bsize_w - 1; j++) {
      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
                             e[i * bsize_w + j + 1];
    }
  }
  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
                                qindex, tmp, e_lp);
}
| |
// Variant of od_compute_dist() that takes the source x and the error e
// (x - reconstruction) directly; the reconstruction y is rebuilt as x - e
// before the same horizontal pre-filter and common tail are applied.
static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
                                   int bsize_h, int qindex) {
  assert(bsize_w >= 8 && bsize_h >= 8);

  // Activity masking is disabled in this build path.
  int activity_masking = 0;

  DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
  int i, j;
  // Reconstruct y = x - e for the variance terms in the common tail.
  for (i = 0; i < bsize_h; i++) {
    for (j = 0; j < bsize_w; j++) {
      y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
    }
  }
  // Horizontal filter: replicated-edge form at the row ends, [1 mid 1]
  // in the interior.
  int mid = OD_DIST_LP_MID;
  for (i = 0; i < bsize_h; i++) {
    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
    tmp[i * bsize_w + bsize_w - 1] =
        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
    for (j = 1; j < bsize_w - 1; j++) {
      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
                             e[i * bsize_w + j + 1];
    }
  }
  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
                                qindex, tmp, e_lp);
}
| |
// Computes the distortion of a bsw x bsh block according to the encoder's
// tune metric: Daala distortion, CDEF distortion, or (default) MSE.
// visible_w/visible_h bound the pixels that actually lie inside the frame;
// for the perceptual metrics, the non-visible region of 'rec' is padded
// with source pixels so it contributes zero error.
int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                     const uint8_t *src, int src_stride, const uint8_t *dst,
                     int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
                     int bsh, int visible_w, int visible_h, int qindex) {
  int64_t d = 0;
  int i, j;
  const MACROBLOCKD *xd = &x->e_mbd;

  DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]);

  // Both perceptual metrics operate on 8x8 tiles.
  assert(bsw >= 8);
  assert(bsh >= 8);
  assert((bsw & 0x07) == 0);
  assert((bsh & 0x07) == 0);

  // Copy src/dst into densely packed 16-bit buffers (orig/rec); only needed
  // for the perceptual metrics, the MSE path reads src/dst directly.
  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
      x->tune_metric == AOM_TUNE_DAALA_DIST) {
    if (is_cur_buf_hbd(xd)) {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++)
          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];

      if ((bsw == visible_w) && (bsh == visible_h)) {
        for (j = 0; j < bsh; j++)
          for (i = 0; i < bsw; i++)
            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
      } else {
        // Copy the visible region, then pad the rest of 'rec' with source
        // pixels so the invisible area contributes no distortion.
        for (j = 0; j < visible_h; j++)
          for (i = 0; i < visible_w; i++)
            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];

        if (visible_w < bsw) {
          for (j = 0; j < bsh; j++)
            for (i = visible_w; i < bsw; i++)
              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
        }

        if (visible_h < bsh) {
          for (j = visible_h; j < bsh; j++)
            for (i = 0; i < bsw; i++)
              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
        }
      }
    } else {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];

      if ((bsw == visible_w) && (bsh == visible_h)) {
        for (j = 0; j < bsh; j++)
          for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
      } else {
        // Same visible-region copy + source padding as the HBD path above.
        for (j = 0; j < visible_h; j++)
          for (i = 0; i < visible_w; i++)
            rec[j * bsw + i] = dst[j * dst_stride + i];

        if (visible_w < bsw) {
          for (j = 0; j < bsh; j++)
            for (i = visible_w; i < bsw; i++)
              rec[j * bsw + i] = src[j * src_stride + i];
        }

        if (visible_h < bsh) {
          for (j = visible_h; j < bsh; j++)
            for (i = 0; i < bsw; i++)
              rec[j * bsw + i] = src[j * src_stride + i];
        }
      }
    }
  }

  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
    d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
    int coeff_shift = AOMMAX(xd->bd - 8, 0);

    // Sum CDEF distortion over the block's 8x8 tiles.
    for (i = 0; i < bsh; i += 8) {
      for (j = 0; j < bsw; j += 8) {
        d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
                                 bsw, coeff_shift);
      }
    }
    // Scale high-bit-depth distortion back to 8-bit-equivalent units.
    if (is_cur_buf_hbd(xd)) d = ((uint64_t)d) >> 2 * coeff_shift;
  } else {
    // Otherwise, MSE by default
    d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
                                tx_bsize, bsh, bsw, visible_h, visible_w);
  }

  return d;
}
| |
// Diff-input variant of av1_dist_8x8(): the prediction error is supplied in
// 'diff' instead of a reconstructed buffer. The invisible region of the
// diff is zero-padded so it contributes no distortion. For the CDEF metric
// on HBD input, the caller is responsible for the bit-depth scaling.
static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
                             int src_stride, const int16_t *diff,
                             int diff_stride, int bsw, int bsh, int visible_w,
                             int visible_h, int qindex) {
  int64_t d = 0;
  int i, j;
  const MACROBLOCKD *xd = &x->e_mbd;

  DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]);

  // Both perceptual metrics operate on 8x8 tiles.
  assert(bsw >= 8);
  assert(bsh >= 8);
  assert((bsw & 0x07) == 0);
  assert((bsh & 0x07) == 0);

  // Pack source and diff into dense buffers; only the perceptual metrics
  // need them, the MSE path reads 'diff' directly.
  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
      x->tune_metric == AOM_TUNE_DAALA_DIST) {
    if (is_cur_buf_hbd(xd)) {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++)
          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
    } else {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
    }

    if ((bsw == visible_w) && (bsh == visible_h)) {
      for (j = 0; j < bsh; j++)
        for (i = 0; i < bsw; i++)
          diff16[j * bsw + i] = diff[j * diff_stride + i];
    } else {
      // Copy the visible region and zero-pad the rest so the invisible
      // area contributes no error.
      for (j = 0; j < visible_h; j++)
        for (i = 0; i < visible_w; i++)
          diff16[j * bsw + i] = diff[j * diff_stride + i];

      if (visible_w < bsw) {
        for (j = 0; j < bsh; j++)
          for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
      }

      if (visible_h < bsh) {
        for (j = visible_h; j < bsh; j++)
          for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
      }
    }
  }

  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
    d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
    int coeff_shift = AOMMAX(xd->bd - 8, 0);
    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]);

    // Rebuild the reconstruction as source - diff for the CDEF metric.
    for (i = 0; i < bsh; i++) {
      for (j = 0; j < bsw; j++) {
        dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j];
      }
    }

    for (i = 0; i < bsh; i += 8) {
      for (j = 0; j < bsw; j += 8) {
        d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j],
                                 bsw, coeff_shift);
      }
    }
    // Don't scale 'd' for HBD since it will be done by caller side for diff
    // input
  } else {
    // Otherwise, MSE by default
    d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
  }

  return d;
}
| #endif // CONFIG_DIST_8X8 |
| |
| static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| const uint8_t *src, int src_stride, |
| const uint8_t *dst, int dst_stride, |
| int need_4th, double *hordist, |
| double *verdist) { |
| const int bw = block_size_wide[bsize]; |
| const int bh = block_size_high[bsize]; |
| unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| |
| if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { |
| // Special cases: calculate 'esq' values manually, as we don't have 'vf' |
| // functions for the 16 (very small) sub-blocks of this block. |
| const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; |
| const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; |
| assert(bw <= 32); |
| assert(bh <= 32); |
| assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); |
| if (cpi->common.seq_params.use_highbitdepth) { |
| const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); |
| const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); |
| for (int i = 0; i < bh; ++i) |
| for (int j = 0; j < bw; ++j) { |
| const int index = (j >> w_shift) + ((i >> h_shift) << 2); |
| esq[index] += |
| (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * |
| (src16[j + i * src_stride] - dst16[j + i * dst_stride]); |
| } |
| } else { |
| for (int i = 0; i < bh; ++i) |
| for (int j = 0; j < bw; ++j) { |
| const int index = (j >> w_shift) + ((i >> h_shift) << 2); |
| esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * |
| (src[j + i * src_stride] - dst[j + i * dst_stride]); |
| } |
| } |
| } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. |
| const int f_index = |
| (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16; |
| assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); |
| const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; |
| assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); |
| assert(block_size_high[bsize] == 4 * block_size_high[subsize]); |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[1]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[2]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[3]); |
| src += bh / 4 * src_stride; |
| dst += bh / 4 * dst_stride; |
| |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[5]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[6]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[7]); |
| src += bh / 4 * src_stride; |
| dst += bh / 4 * dst_stride; |
| |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[9]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[10]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[11]); |
| src += bh / 4 * src_stride; |
| dst += bh / 4 * dst_stride; |
| |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[13]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[14]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[15]); |
| } |
| |
| double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + |
| esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + |
| esq[12] + esq[13] + esq[14] + esq[15]; |
| if (total > 0) { |
| const double e_recip = 1.0 / total; |
| hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; |
| hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; |
| hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; |
| if (need_4th) { |
| hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; |
| } |
| verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; |
| verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; |
| verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; |
| if (need_4th) { |
| verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; |
| } |
| } else { |
| hordist[0] = verdist[0] = 0.25; |
| hordist[1] = verdist[1] = 0.25; |
| hordist[2] = verdist[2] = 0.25; |
| if (need_4th) { |
| hordist[3] = verdist[3] = 0.25; |
| } |
| } |
| } |
| |
| static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| const uint8_t *src, int src_stride, |
| const uint8_t *dst, int dst_stride) { |
| int prune_bitmask = 0; |
| double svm_proj_h = 0, svm_proj_v = 0; |
| double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 }; |
| get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0, |
| hdist, vdist); |
| |
| svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] + |
| vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3]; |
| svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] + |
| hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7]; |
| if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << FLIPADST_1D; |
| else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << ADST_1D; |
| |
| if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << (FLIPADST_1D + 8); |
| else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << (ADST_1D + 8); |
| |
| return prune_bitmask; |
| } |
| |
| static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { |
| float hcorr, vcorr; |
| int prune_bitmask = 0; |
| av1_get_horver_correlation_full(diff, stride, w, h, &hcorr, &vcorr); |
| |
| if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << IDTX_1D; |
| else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << DCT_1D; |
| |
| if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << (IDTX_1D + 8); |
| else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << (DCT_1D + 8); |
| return prune_bitmask; |
| } |
| |
| // Performance drop: 0.5%, Speed improvement: 24% |
| static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| MACROBLOCK *x, const MACROBLOCKD *xd, |
| int adst_flipadst, int dct_idtx) { |
| int prune = 0; |
| |
| if (adst_flipadst) { |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const struct macroblockd_plane *const pd = &xd->plane[0]; |
| prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, |
| pd->dst.buf, pd->dst.stride); |
| } |
| if (dct_idtx) { |
| av1_subtract_plane(x, bsize, 0); |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const int bw = block_size_wide[bsize]; |
| const int bh = block_size_high[bsize]; |
| prune |= dct_vs_idtx(p->src_diff, bw, bw, bh); |
| } |
| |
| return prune; |
| } |
| |
| // Performance drop: 0.3%, Speed improvement: 5% |
| static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| const MACROBLOCK *x, const MACROBLOCKD *xd) { |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const struct macroblockd_plane *const pd = &xd->plane[0]; |
| return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf, |
| pd->dst.stride); |
| } |
| |
// 1D Transforms used in inter set, this needs to be changed if
// ext_tx_used_inter is changed
// Each row is one EXT_TX_SETS_INTER set; each column flags whether a 1D
// transform type is a member. Columns presumably follow the TX_TYPES_1D
// enum order (DCT, ADST, FLIPADST, IDTX) -- confirm against the enum
// definition in av1/common before relying on the column meaning.
static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
  { 1, 0, 0, 0 },
  { 1, 1, 1, 1 },
  { 1, 1, 1, 1 },
  { 1, 0, 0, 1 },
};
| |
// Computes normalized 1D projections of the residual's squared-error energy.
// The bw x bh residual is first reduced to an esq_w x esq_h map of squared
// errors (2:1 downscale per direction for blocks wider/taller than 8), then
// the map's column sums go to hordist[0 .. esq_w-2] and its row sums to
// verdist[0 .. esq_h-2], each normalized by the total energy (the last
// entry of each projection is implied and not written). A zero-energy block
// yields flat distributions.
static void get_energy_distribution_finer(const int16_t *diff, int stride,
                                          int bw, int bh, float *hordist,
                                          float *verdist) {
  unsigned int esq[256];
  const int w_shift = (bw <= 8) ? 0 : 1;
  const int h_shift = (bh <= 8) ? 0 : 1;
  const int esq_w = bw >> w_shift;
  const int esq_h = bh >> h_shift;
  const int esq_sz = esq_w * esq_h;
  memset(esq, 0, esq_sz * sizeof(esq[0]));

  // Build the (possibly downscaled) squared-error map.
  for (int r = 0; r < bh; r++) {
    unsigned int *const esq_row = esq + (r >> h_shift) * esq_w;
    const int16_t *const diff_row = diff + r * stride;
    if (w_shift) {
      for (int c = 0; c < bw; c += 2) {
        esq_row[c >> 1] += (diff_row[c] * diff_row[c] +
                            diff_row[c + 1] * diff_row[c + 1]);
      }
    } else {
      for (int c = 0; c < bw; c++) {
        esq_row[c] += diff_row[c] * diff_row[c];
      }
    }
  }

  uint64_t total = 0;
  for (int k = 0; k < esq_sz; k++) total += esq[k];

  if (total == 0) {
    // No energy anywhere: report flat distributions.
    const float hor_val = 1.0f / esq_w;
    for (int c = 0; c < esq_w - 1; c++) hordist[c] = hor_val;
    const float ver_val = 1.0f / esq_h;
    for (int r = 0; r < esq_h - 1; r++) verdist[r] = ver_val;
    return;
  }

  memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
  memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
  // Accumulate column sums into hordist and row sums into verdist,
  // walking the map in row-major order.
  for (int r = 0; r < esq_h; r++) {
    const unsigned int *const esq_row = esq + r * esq_w;
    for (int c = 0; c < esq_w; c++) {
      if (c < esq_w - 1) hordist[c] += (float)esq_row[c];
      if (r < esq_h - 1) verdist[r] += (float)esq_row[c];
    }
  }

  const float e_recip = 1.0f / (float)total;
  for (int c = 0; c < esq_w - 1; c++) hordist[c] *= e_recip;
  for (int r = 0; r < esq_h - 1; r++) verdist[r] *= e_recip;
}
| |
// Similar to get_horver_correlation, but also takes into account first
// row/column, when computing horizontal/vertical correlation.
// Computes the Pearson correlation of each pixel with its left neighbor
// (*hcorr) and with its top neighbor (*vcorr), over the whole width x height
// residual. Results are clamped to be non-negative, and default to 1.0 when
// either variance involved is non-positive (e.g. a constant residual).
void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
                                       int width, int height, float *hcorr,
                                       float *vcorr) {
  // The following notation is used:
  // x - current pixel
  // y - left neighbor pixel
  // z - top neighbor pixel
  int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0;
  int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0;
  int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0;

  // First, process horizontal correlation on just the first row
  x_sum += diff[0];
  x2_sum += diff[0] * diff[0];
  x_firstrow += diff[0];
  x2_firstrow += diff[0] * diff[0];
  for (int j = 1; j < width; ++j) {
    const int16_t x = diff[j];
    const int16_t y = diff[j - 1];
    x_sum += x;
    x_firstrow += x;
    x2_sum += x * x;
    x2_firstrow += x * x;
    xy_sum += x * y;
  }

  // Process vertical correlation in the first column
  x_firstcol += diff[0];
  x2_firstcol += diff[0] * diff[0];
  for (int i = 1; i < height; ++i) {
    const int16_t x = diff[i * stride];
    const int16_t z = diff[(i - 1) * stride];
    x_sum += x;
    x_firstcol += x;
    x2_sum += x * x;
    x2_firstcol += x * x;
    xz_sum += x * z;
  }

  // Now process horiz and vert correlation through the rest unit
  for (int i = 1; i < height; ++i) {
    for (int j = 1; j < width; ++j) {
      const int16_t x = diff[i * stride + j];
      const int16_t y = diff[i * stride + j - 1];
      const int16_t z = diff[(i - 1) * stride + j];
      x_sum += x;
      x2_sum += x * x;
      xy_sum += x * y;
      xz_sum += x * z;
    }
  }

  // Border sums used to derive the restricted (y / z) statistics below.
  for (int j = 0; j < width; ++j) {
    x_finalrow += diff[(height - 1) * stride + j];
    x2_finalrow +=
        diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j];
  }
  for (int i = 0; i < height; ++i) {
    x_finalcol += diff[i * stride + width - 1];
    x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1];
  }

  // xhor/xver exclude the last column/row (pixels that have a right/bottom
  // pairing); y/z exclude the first column/row (pixels that have a
  // left/top neighbor).
  int64_t xhor_sum = x_sum - x_finalcol;
  int64_t xver_sum = x_sum - x_finalrow;
  int64_t y_sum = x_sum - x_firstcol;
  int64_t z_sum = x_sum - x_firstrow;
  int64_t x2hor_sum = x2_sum - x2_finalcol;
  int64_t x2ver_sum = x2_sum - x2_finalrow;
  int64_t y2_sum = x2_sum - x2_firstcol;
  int64_t z2_sum = x2_sum - x2_firstrow;

  const float num_hor = (float)(height * (width - 1));
  const float num_ver = (float)((height - 1) * width);

  // Unnormalized variances/covariances (n * var, n * cov).
  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;

  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;

  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;

  if (xhor_var_n > 0 && y_var_n > 0) {
    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
    *hcorr = *hcorr < 0 ? 0 : *hcorr;
  } else {
    *hcorr = 1.0;
  }
  if (xver_var_n > 0 && z_var_n > 0) {
    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
    *vcorr = *vcorr < 0 ? 0 : *vcorr;
  } else {
    *vcorr = 1.0;
  }
}
| |
// Transforms raw scores into a probability distribution across 16 TX types:
// each score is shifted, clamped to [0, 100], raised to the 8th power to
// sharpen the distribution, then normalized to sum to 1; entries below
// 1e-4 of the total are zeroed.
static void score_2D_transform_pow8(float *scores_2D, float shift) {
  float total = 0.0f;
  for (int i = 0; i < 16; i++) {
    float v = scores_2D[i] + shift;
    if (v < 0.0f) v = 0.0f;
    if (v > 100.0f) v = 100.0f;
    const float v2 = v * v;
    const float v4 = v2 * v2;
    const float v8 = v4 * v4;
    scores_2D[i] = v8;
    total += v8;
  }
  for (int i = 0; i < 16; i++) {
    scores_2D[i] = (scores_2D[i] < total * 1e-4) ? 0.0f : scores_2D[i] / total;
  }
}
| |
// These thresholds were calibrated to provide a certain number of TX types
// pruned by the model on average, i.e. selecting a threshold with index i
// will lead to pruning i+1 TX types on average
// Indexed by TX_SIZE; a NULL entry means no prune model is trained for that
// transform size (prune_tx_2D() bails out when the per-size NN config is
// missing).
static const float *prune_2D_adaptive_thresholds[] = {
  // TX_4X4
  (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
             0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
             0.09778f, 0.11780f },
  // TX_8X8
  (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
             0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
             0.10803f, 0.14124f },
  // TX_16X16
  (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
             0.06897f, 0.07629f, 0.08875f, 0.11169f },
  // TX_32X32
  NULL,
  // TX_64X64
  NULL,
  // TX_4X8
  (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
             0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
             0.10168f, 0.12585f },
  // TX_8X4
  (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
             0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
             0.10583f, 0.13123f },
  // TX_8X16
  (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
             0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
             0.10730f, 0.14221f },
  // TX_16X8
  (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
             0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
             0.10339f, 0.13464f },
  // TX_16X32
  NULL,
  // TX_32X16
  NULL,
  // TX_32X64
  NULL,
  // TX_64X32
  NULL,
  // TX_4X16
  (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
             0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
             0.10242f, 0.12878f },
  // TX_16X4
  (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
             0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
             0.10217f, 0.12610f },
  // TX_8X32
  NULL,
  // TX_32X8
  NULL,
  // TX_16X64
  NULL,
  // TX_64X16
  NULL,
};
| |
// Scores all 16 2D transform types for one transform block using two small
// neural nets (one over the horizontal energy profile, one over the vertical)
// and returns a bitmask of TX types to prune: bit t set => TX type t should
// be skipped. Returns 0 (prune nothing) for unsupported TX sets or TX sizes
// with no trained model.
static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
                            int blk_row, int blk_col, TxSetType tx_set_type,
                            TX_TYPE_PRUNE_MODE prune_mode) {
  // Maps scores_2D[v * 4 + h] to the 2D TX type built from vertical 1D
  // transform v (row) and horizontal 1D transform h (column); see the
  // vscores[i] * hscores[j] products below.
  static const int tx_type_table_2D[16] = {
    DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
    ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
    FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
    H_DCT, H_ADST, H_FLIPADST, IDTX
  };
  if (tx_set_type != EXT_TX_SET_ALL16 &&
      tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
    return 0;
  const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
  const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
  if (!nn_config_hor || !nn_config_ver) return 0;  // Model not established yet.

  aom_clear_system_state();
  float hfeatures[16], vfeatures[16];
  float hscores[4], vscores[4];
  float scores_2D[16];
  const int bw = tx_size_wide[tx_size];
  const int bh = tx_size_high[tx_size];
  // Wide/tall blocks use half-resolution energy features to cap the feature
  // count at 16 (last slot is reserved for the correlation feature below).
  const int hfeatures_num = bw <= 8 ? bw : bw / 2;
  const int vfeatures_num = bh <= 8 ? bh : bh / 2;
  assert(hfeatures_num <= 16);
  assert(vfeatures_num <= 16);

  const struct macroblock_plane *const p = &x->plane[0];
  const int diff_stride = block_size_wide[bsize];
  // Residual of this transform block within the luma prediction residual.
  const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
  get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
                                vfeatures);
  // Final feature in each vector is the horizontal/vertical correlation.
  av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
                                  &hfeatures[hfeatures_num - 1],
                                  &vfeatures[vfeatures_num - 1]);
  av1_nn_predict(hfeatures, nn_config_hor, hscores);
  av1_nn_predict(vfeatures, nn_config_ver, vscores);
  aom_clear_system_state();

  // 2D score = product of the 1D direction scores; also track the mean,
  // which parameterizes the sharpening transform below.
  float score_2D_average = 0.0f;
  for (int i = 0; i < 4; i++) {
    float *cur_scores_2D = scores_2D + i * 4;
    cur_scores_2D[0] = vscores[i] * hscores[0];
    cur_scores_2D[1] = vscores[i] * hscores[1];
    cur_scores_2D[2] = vscores[i] * hscores[2];
    cur_scores_2D[3] = vscores[i] * hscores[3];
    score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] +
                        cur_scores_2D[3];
  }
  score_2D_average /= 16;

  // Rows: PRUNE_2D_ACCURATE vs PRUNE_2D_FAST; columns: per-TX-set target
  // number of pruned types (indexes into prune_2D_adaptive_thresholds).
  const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
  int pruning_aggressiveness = 1;
  if (tx_set_type == EXT_TX_SET_ALL16) {
    score_2D_transform_pow8(scores_2D, (10 - score_2D_average));
    pruning_aggressiveness =
        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
  } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
    score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
    pruning_aggressiveness =
        prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
  }

  // Always keep the TX type with the highest score, prune all others with
  // score below score_thresh.
  int max_score_i = 0;
  float max_score = 0.0f;
  for (int i = 0; i < 16; i++) {
    if (scores_2D[i] > max_score &&
        av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) {
      max_score = scores_2D[i];
      max_score_i = i;
    }
  }

  const float score_thresh =
      prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];

  uint16_t prune_bitmask = 0;
  for (int i = 0; i < 16; i++) {
    if (scores_2D[i] < score_thresh && i != max_score_i)
      prune_bitmask |= (1 << tx_type_table_2D[i]);
  }
  return prune_bitmask;
}
| |
// Precomputed expansion of the per-1D-transform prune bits into 16-bit
// per-2D-TX-type masks. Entry p gives, for every 2D TX type t, the value of
// ((p >> vtx_tab[t]) & 1) packed into bit t, i.e. which 2D TX types have
// their *vertical* 1D transform flagged in the 4-bit prune nibble p.
static const uint16_t prune_v_mask[] = {
  0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff,
  0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff,
};

// Same expansion for the horizontal direction: entry p gives
// ((p >> (htx_tab[t] + 8)) & 1) packed into bit t, where p is the high
// nibble of the prune word (bits 8..11).
static const uint16_t prune_h_mask[] = {
  0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff,
  0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff,
};
| |
| static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) { |
| uint8_t prune_v = tx_search_prune & 0x0F; |
| uint8_t prune_h = (tx_search_prune >> 8) & 0x0F; |
| return (prune_v_mask[prune_v] & prune_h_mask[prune_h]); |
| } |
| |
// Decides, before the TX type search, which transform types can be skipped
// for the current block and stores the result as a bitmask in
// x->tx_search_prune[tx_set_type] (bit t set => skip TX type t).
static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                     const MACROBLOCKD *const xd, int tx_set_type) {
  x->tx_search_prune[tx_set_type] = 0;
  x->tx_split_prune_flag = 0;
  const MB_MODE_INFO *mbmi = xd->mi[0];
  const int is_inter = is_inter_block(mbmi);
  // DCT-only configuration: prune everything except DCT_DCT.
  if ((is_inter && cpi->oxcf.use_inter_dct_only) ||
      (!is_inter && cpi->oxcf.use_intra_dct_only)) {
    x->tx_search_prune[tx_set_type] = ~(1 << DCT_DCT);
    return;
  }
  // Model-based pruning only applies to inter blocks; skip it for lossless,
  // forced-default-TX-type, or cb-partition-scan passes as well.
  if (!is_inter || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
      x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] ||
      x->cb_partition_scan)
    return;
  int tx_set = ext_tx_set_index[1][tx_set_type];
  assert(tx_set >= 0);
  const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
  int prune = 0;
  switch (cpi->sf.tx_type_search.prune_mode) {
    case NO_PRUNE: return;
    case PRUNE_ONE:
      // Needs both ADST and FLIPADST 1D transforms present in the set.
      if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return;
      prune = prune_one_for_sby(cpi, bsize, x, xd);
      x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
      break;
    case PRUNE_TWO:
      // Choose which of the two 1D pairs (ADST/FLIPADST, DCT/IDTX) can be
      // pruned based on availability in this TX set.
      if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
        if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return;
        prune = prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
      } else if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
        prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
      } else {
        prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
      }
      x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
      break;
    case PRUNE_2D_ACCURATE:
    case PRUNE_2D_FAST: break;  // Handled per TX block by prune_tx_2D().
    default: assert(0);
  }
}
| |
// Estimates rate and distortion for one plane directly from the prediction
// SSE, without running a transform/quantization. The distortion is returned
// in the Q4 (x16) domain used by the RD loop.
static void model_rd_from_sse(const AV1_COMP *const cpi,
                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
                              int plane, int64_t sse, int num_samples,
                              int *rate, int64_t *dist) {
  (void)num_samples;
  const MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  // Bring the Q3 dequant step to an effective quantizer scale; high bit
  // depth needs an extra (xd->bd - 8) of normalization.
  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;

  // Fast approximate the modelling function.
  if (cpi->sf.simple_model_rd_from_var) {
    const int64_t square_error = sse;
    int quantizer = pd->dequant_Q3[1] >> dequant_shift;
    // Rate is approximated as linear in (280 - quantizer), dropping to 0 for
    // coarse quantizers. The constants 280/120 look empirically tuned --
    // NOTE(review): provenance not visible here, confirm before adjusting.
    if (quantizer < 120)
      *rate = (int)AOMMIN(
          (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
          INT_MAX);
    else
      *rate = 0;
    assert(*rate >= 0);
    *dist = (square_error * quantizer) >> 8;
  } else {
    // Full model (Laplacian source per the helper's name).
    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
                                 pd->dequant_Q3[1] >> dequant_shift, rate,
                                 dist);
  }
  // Scale distortion into the Q4 domain.
  *dist <<= 4;
}
| |
| static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { |
| const AV1_COMMON *cm = &cpi->common; |
| const int num_planes = av1_num_planes(cm); |
| const MACROBLOCKD *xd = &x->e_mbd; |
| const MB_MODE_INFO *mbmi = xd->mi[0]; |
| int64_t total_sse = 0; |
| for (int plane = 0; plane < num_planes; ++plane) { |
| const struct macroblock_plane *const p = &x->plane[plane]; |
| const struct macroblockd_plane *const pd = &xd->plane[plane]; |
| const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, |
| pd->subsampling_y); |
| unsigned int sse; |
| |
| if (x->skip_chroma_rd && plane) continue; |
| |
| cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, |
| &sse); |
| total_sse += sse; |
| } |
| total_sse <<= 4; |
| return total_sse; |
| } |
| |
// Models rate and distortion for planes [plane_from, plane_to] of a block
// from prediction SSE alone. Outputs the summed rate/distortion, optional
// per-plane breakdowns, and optional skip signals (all-zero SSE and the
// Q4-scaled SSE used to evaluate skip mode).
static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                            MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
                            int plane_to, int mi_row, int mi_col,
                            int *out_rate_sum, int64_t *out_dist_sum,
                            int *skip_txfm_sb, int64_t *skip_sse_sb,
                            int *plane_rate, int64_t *plane_sse,
                            int64_t *plane_dist) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  int plane;
  (void)mi_row;
  (void)mi_col;
  const int ref = xd->mi[0]->ref_frame[0];

  int64_t rate_sum = 0;
  int64_t dist_sum = 0;
  int64_t total_sse = 0;

  assert(bsize < BLOCK_SIZES_ALL);

  for (plane = plane_from; plane <= plane_to; ++plane) {
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
    const BLOCK_SIZE plane_bsize =
        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
    assert(plane_bsize < BLOCK_SIZES_ALL);
    const int bw = block_size_wide[plane_bsize];
    const int bh = block_size_high[plane_bsize];
    int64_t sse;
    int rate;
    int64_t dist;

    if (x->skip_chroma_rd && plane) continue;

    if (is_cur_buf_hbd(xd)) {
      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
                           pd->dst.stride, bw, bh);
    } else {
      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
                    bh);
    }
    // Normalize high-bit-depth SSE back to 8-bit scale. Assumes xd->bd == 8
    // in the low-bit-depth path so the shift is a no-op -- TODO confirm.
    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);

    model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);

    // Cache luma prediction SSE per reference for later speed decisions.
    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);

    total_sse += sse;
    rate_sum += rate;
    dist_sum += dist;
    if (plane_rate) plane_rate[plane] = rate;
    if (plane_sse) plane_sse[plane] = sse;
    if (plane_dist) plane_dist[plane] = dist;
    assert(rate_sum >= 0);
  }

  if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;  // Q4 domain.
  rate_sum = AOMMIN(rate_sum, INT_MAX);
  *out_rate_sum = (int)rate_sum;
  *out_dist_sum = dist_sum;
}
| |
| int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, |
| intptr_t block_size, int64_t *ssz) { |
| int i; |
| int64_t error = 0, sqcoeff = 0; |
| |
| for (i = 0; i < block_size; i++) { |
| const int diff = coeff[i] - dqcoeff[i]; |
| error += diff * diff; |
| sqcoeff += coeff[i] * coeff[i]; |
| } |
| |
| *ssz = sqcoeff; |
| return error; |
| } |
| |
| int64_t av1_highbd_block_error_c(const tran_low_t *coeff, |
| const tran_low_t *dqcoeff, intptr_t block_size, |
| int64_t *ssz, int bd) { |
| int i; |
| int64_t error = 0, sqcoeff = 0; |
| int shift = 2 * (bd - 8); |
| int rounding = shift > 0 ? 1 << (shift - 1) : 0; |
| |
| for (i = 0; i < block_size; i++) { |
| const int64_t diff = coeff[i] - dqcoeff[i]; |
| error += diff * diff; |
| sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; |
| } |
| assert(error >= 0 && sqcoeff >= 0); |
| error = (error + rounding) >> shift; |
| sqcoeff = (sqcoeff + rounding) >> shift; |
| |
| *ssz = sqcoeff; |
| return error; |
| } |
| |
// Get transform block visible dimensions cropped to the MI units.
// *visible_width/*visible_height give the portion of the TX block that lies
// inside the frame (in pixels); *width/*height (optional) return the full
// TX block dimensions.
static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
                               BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
                               BLOCK_SIZE tx_bsize, int *width, int *height,
                               int *visible_width, int *visible_height) {
  assert(tx_bsize <= plane_bsize);
  int txb_height = block_size_high[tx_bsize];
  int txb_width = block_size_wide[tx_bsize];
  const int block_height = block_size_high[plane_bsize];
  const int block_width = block_size_wide[plane_bsize];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  // TODO(aconverse@google.com): Investigate using crop_width/height here rather
  // than the MI size
  // mb_to_bottom_edge/mb_to_right_edge are negative when the block extends
  // past the frame edge; >> (3 + subsampling) converts 1/8-pel units to
  // plane pixels, shrinking the visible extent accordingly.
  const int block_rows =
      (xd->mb_to_bottom_edge >= 0)
          ? block_height
          : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
  const int block_cols =
      (xd->mb_to_right_edge >= 0)
          ? block_width
          : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
  // Log2 of the 4x4 unit size; blk_row/blk_col are in 4x4 units.
  const int tx_unit_size = tx_size_wide_log2[0];
  if (width) *width = txb_width;
  if (height) *height = txb_height;
  *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width);
  *visible_height =
      clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height);
}
| |
// Compute the pixel domain distortion from src and dst on all visible 4x4s
// in the transform block. Returns the SSE over the visible region only
// (or the dist-8x8 metric when that experiment is enabled for luma).
static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
                           int plane, const uint8_t *src, const int src_stride,
                           const uint8_t *dst, const int dst_stride,
                           int blk_row, int blk_col,
                           const BLOCK_SIZE plane_bsize,
                           const BLOCK_SIZE tx_bsize) {
  int txb_rows, txb_cols, visible_rows, visible_cols;
  const MACROBLOCKD *xd = &x->e_mbd;

  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
                     &txb_cols, &txb_rows, &visible_cols, &visible_rows);
  assert(visible_rows > 0);
  assert(visible_cols > 0);

#if CONFIG_DIST_8X8
  // The dist-8x8 experiment replaces SSE with its own luma metric.
  if (x->using_dist_8x8 && plane == 0)
    return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
                                  tx_bsize, txb_cols, txb_rows, visible_cols,
                                  visible_rows, x->qindex);
#endif  // CONFIG_DIST_8X8

  unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
                                         dst_stride, tx_bsize, txb_rows,
                                         txb_cols, visible_rows, visible_cols);

  return sse;
}
| |
// Compute the pixel domain distortion from diff on all visible 4x4s in the
// transform block. Also reports the mean squared error in Q8 through
// |block_mse_q8| when non-NULL (UINT_MAX when nothing is visible).
static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                      int blk_row, int blk_col,
                                      const BLOCK_SIZE plane_bsize,
                                      const BLOCK_SIZE tx_bsize,
                                      unsigned int *block_mse_q8) {
  int visible_rows, visible_cols;
  const MACROBLOCKD *xd = &x->e_mbd;
  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
                     NULL, &visible_cols, &visible_rows);
  const int diff_stride = block_size_wide[plane_bsize];
  const int16_t *diff = x->plane[plane].src_diff;
#if CONFIG_DIST_8X8
  int txb_height = block_size_high[tx_bsize];
  int txb_width = block_size_wide[tx_bsize];
  // The dist-8x8 experiment uses its own diff-based luma metric. Note this
  // path does not fill |block_mse_q8|.
  if (x->using_dist_8x8 && plane == 0) {
    const int src_stride = x->plane[plane].src.stride;
    const int src_idx = (blk_row * src_stride + blk_col)
                        << tx_size_wide_log2[0];
    const int diff_idx = (blk_row * diff_stride + blk_col)
                         << tx_size_wide_log2[0];
    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
    return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride,
                         txb_width, txb_height, visible_cols, visible_rows,
                         x->qindex);
  }
#endif
  // Advance to this transform block's residual (blk_row/blk_col in 4x4 units).
  diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
  uint64_t sse =
      aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
  if (block_mse_q8 != NULL) {
    if (visible_cols > 0 && visible_rows > 0)
      *block_mse_q8 =
          (unsigned int)((256 * sse) / (visible_cols * visible_rows));
    else
      *block_mse_q8 = UINT_MAX;
  }
  return sse;
}
| |
// Counts the number of distinct sample values in an 8-bit rows x cols image
// region. |val_count| must hold at least 256 ints; on return it contains the
// per-value histogram. Returns the number of nonzero histogram bins.
int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
                     int *val_count) {
  const int num_pix_vals = 1 << 8;
  memset(val_count, 0, num_pix_vals * sizeof(*val_count));
  for (int r = 0; r < rows; ++r) {
    const uint8_t *const row = src + r * stride;
    for (int c = 0; c < cols; ++c) {
      assert(row[c] < num_pix_vals);
      ++val_count[row[c]];
    }
  }
  int num_colors = 0;
  for (int val = 0; val < num_pix_vals; ++val) {
    num_colors += (val_count[val] != 0);
  }
  return num_colors;
}
| |
// High-bit-depth variant of av1_count_colors: counts distinct sample values
// in a rows x cols region of 16-bit samples. |val_count| must hold at least
// (1 << bit_depth) ints and receives the histogram. Returns the number of
// distinct values, or 0 if a sample exceeds the declared bit depth
// (defensive check for corrupt input).
int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
                            int bit_depth, int *val_count) {
  assert(bit_depth <= 12);
  const int num_pix_vals = 1 << bit_depth;
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  memset(val_count, 0, num_pix_vals * sizeof(*val_count));
  for (int r = 0; r < rows; ++r) {
    const uint16_t *const row = src + r * stride;
    for (int c = 0; c < cols; ++c) {
      const int val = row[c];
      assert(val < num_pix_vals);
      if (val >= num_pix_vals) return 0;
      ++val_count[val];
    }
  }
  int num_colors = 0;
  for (int val = 0; val < num_pix_vals; ++val) {
    num_colors += (val_count[val] != 0);
  }
  return num_colors;
}
| |
// Convenience wrapper: derives the TX size/type and destination pointer for
// the given block position, then runs the inverse transform, adding the
// reconstruction into pd->dst in place.
static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane,
                                           int block, int blk_row, int blk_col,
                                           int eob, int reduced_tx_set) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  const PLANE_TYPE plane_type = get_plane_type(plane);
  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
                                          tx_size, reduced_tx_set);
  const int dst_stride = pd->dst.stride;
  // blk_row/blk_col are in 4x4 units; shift converts to pixels.
  uint8_t *dst =
      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
                              dst_stride, eob, reduced_tx_set);
}
| |
| static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash); |
| |
// Hashes the residual of one intra transform block for the TXB RD cache.
// The CRC32C of the block's raw int16 residual bytes is combined with the
// TX size in the low 5 bits so records for different sizes never collide.
static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
                                   int blk_col, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size) {
  int16_t tmp_data[64 * 64];
  const int diff_stride = block_size_wide[plane_bsize];
  const int16_t *diff = x->plane[plane].src_diff;
  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
  const int txb_w = tx_size_wide[tx_size];
  const int txb_h = tx_size_high[tx_size];
  uint8_t *hash_data = (uint8_t *)cur_diff_row;
  // The CRC helper wants contiguous data; repack row by row unless the TX
  // block already spans the full residual stride.
  if (txb_w != diff_stride) {
    int16_t *cur_hash_row = tmp_data;
    for (int i = 0; i < txb_h; i++) {
      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
      cur_hash_row += txb_w;
      cur_diff_row += diff_stride;
    }
    hash_data = (uint8_t *)tmp_data;
  }
  CRC32C *crc = &x->mb_rd_record.crc_calculator;
  // 2 bytes per int16 sample.
  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
  return (hash << 5) + tx_size;
}
| |
// Computes transform-domain distortion (coeff vs dqcoeff error) and SSE for
// one transform block, shifted into the pixel-domain distortion scale.
static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
                                        TX_SIZE tx_size, int64_t *out_dist,
                                        int64_t *out_sse) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  // Transform domain distortion computation is more efficient as it does
  // not involve an inverse transform, but it is less accurate.
  const int buffer_length = av1_get_max_eob(tx_size);
  int64_t this_sse;
  // TX-domain results need to shift down to Q2/D10 to match pixel
  // domain distortion values which are in Q2^2
  int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

  // High bit depth uses the variant that renormalizes for bit depth.
  if (is_cur_buf_hbd(xd))
    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
                                       xd->bd);
  else
    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);

  *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
  *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
}
| |
| static INLIN
|