| /* |
| * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include <assert.h> |
| #include <math.h> |
| |
| #include "config/aom_dsp_rtcd.h" |
| #include "config/av1_rtcd.h" |
| |
| #include "aom_dsp/aom_dsp_common.h" |
| #include "aom_dsp/blend.h" |
| #include "aom_mem/aom_mem.h" |
| #include "aom_ports/aom_timer.h" |
| #include "aom_ports/mem.h" |
| #include "aom_ports/system_state.h" |
| |
| #include "av1/common/cfl.h" |
| #include "av1/common/common.h" |
| #include "av1/common/common_data.h" |
| #include "av1/common/entropy.h" |
| #include "av1/common/entropymode.h" |
| #include "av1/common/idct.h" |
| #include "av1/common/mvref_common.h" |
| #include "av1/common/obmc.h" |
| #include "av1/common/pred_common.h" |
| #include "av1/common/quant_common.h" |
| #include "av1/common/reconinter.h" |
| #include "av1/common/reconintra.h" |
| #include "av1/common/scan.h" |
| #include "av1/common/seg_common.h" |
| #include "av1/common/txb_common.h" |
| #include "av1/common/warped_motion.h" |
| |
| #include "av1/encoder/aq_variance.h" |
| #include "av1/encoder/av1_quantize.h" |
| #include "av1/encoder/cost.h" |
| #include "av1/encoder/encodemb.h" |
| #include "av1/encoder/encodemv.h" |
| #include "av1/encoder/encoder.h" |
| #include "av1/encoder/encodetxb.h" |
| #include "av1/encoder/hybrid_fwd_txfm.h" |
| #include "av1/encoder/mcomp.h" |
| #include "av1/encoder/ml.h" |
| #include "av1/encoder/palette.h" |
| #include "av1/encoder/pustats.h" |
| #include "av1/encoder/random.h" |
| #include "av1/encoder/ratectrl.h" |
| #include "av1/encoder/rd.h" |
| #include "av1/encoder/rdopt.h" |
| #include "av1/encoder/tokenize.h" |
| #include "av1/encoder/tx_prune_model_weights.h" |
| |
| // Set this macro to 1 to collect data about tx size selection. |
| #define COLLECT_TX_SIZE_DATA 0 |
| |
| #if COLLECT_TX_SIZE_DATA |
| static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; |
| #endif |
| |
| #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) |
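| // All DUAL_FILTER_SET_SIZE (y, x) interpolation-filter pairs, packed as |
| // InterpFilters words: each row shares the y (vertical) filter noted in its |
| // comment, and the column selects the x (horizontal) filter. |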
| static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { |
| 0x00000000, 0x00010000, 0x00020000, // y = 0 |
| 0x00000001, 0x00010001, 0x00020001, // y = 1 |
| 0x00000002, 0x00010002, 0x00020002, // y = 2 |
| }; |
| |
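| // Bit mask over MV_REFERENCE_FRAME values of the frames allowed as the |
| // second reference of a compound pair; bit 0 additionally admits |
| // INTRA_FRAME as the second reference. |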
| #define SECOND_REF_FRAME_MASK \ |
| ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \ |
| (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01) |
| |
| #define ANGLE_SKIP_THRESH 10 |
| |
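| // Linear SVM used by adst_vs_flipadst() to decide between pruning ADST and |
| // FLIPADST: entries 0-2 (4-6) weight the vertical (horizontal) energy |
| // distribution and entries 3 (7) are the bias terms. |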
| static const double ADST_FLIP_SVM[8] = { |
| /* vertical */ |
| -6.6623, -2.8062, -3.2531, 3.1671, |
| /* horizontal */ |
| -7.7051, -3.2234, -3.6193, 3.4533 |
| }; |
| |
| typedef struct { |
| PREDICTION_MODE mode; |
| MV_REFERENCE_FRAME ref_frame[2]; |
| } MODE_DEFINITION; |
| |
| typedef struct { |
| MV_REFERENCE_FRAME ref_frame[2]; |
| } REF_DEFINITION; |
| |
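| // Bit flags that restrict the transform-type search for speed: search only |
| // DCT plus 1-D DCT combinations, skip trellis coefficient optimization, |
| // and/or estimate distortion in the transform domain. |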
| typedef enum { |
| FTXS_NONE = 0, |
| FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, |
| FTXS_DISABLE_TRELLIS_OPT = 1 << 1, |
| FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 |
| } FAST_TX_SEARCH_MODE; |
| |
| struct rdcost_block_args { |
| const AV1_COMP *cpi; |
| MACROBLOCK *x; |
| ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; |
| ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; |
| RD_STATS rd_stats; |
| int64_t this_rd; |
| int64_t best_rd; |
| int exit_early; |
| int use_fast_coef_costing; |
| FAST_TX_SEARCH_MODE ftxs_mode; |
| }; |
| |
| #define LAST_NEW_MV_INDEX 6 |
| static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { |
| { NEARESTMV, { LAST_FRAME, NONE_FRAME } }, |
| { NEARESTMV, { LAST2_FRAME, NONE_FRAME } }, |
| { NEARESTMV, { LAST3_FRAME, NONE_FRAME } }, |
| { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } }, |
| { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } }, |
| { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, |
| { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, |
| |
| { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| |
| { NEWMV, { LAST_FRAME, NONE_FRAME } }, |
| { NEWMV, { LAST2_FRAME, NONE_FRAME } }, |
| { NEWMV, { LAST3_FRAME, NONE_FRAME } }, |
| { NEWMV, { BWDREF_FRAME, NONE_FRAME } }, |
| { NEWMV, { ALTREF2_FRAME, NONE_FRAME } }, |
| { NEWMV, { ALTREF_FRAME, NONE_FRAME } }, |
| { NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, |
| |
| { NEARMV, { LAST_FRAME, NONE_FRAME } }, |
| { NEARMV, { LAST2_FRAME, NONE_FRAME } }, |
| { NEARMV, { LAST3_FRAME, NONE_FRAME } }, |
| { NEARMV, { BWDREF_FRAME, NONE_FRAME } }, |
| { NEARMV, { ALTREF2_FRAME, NONE_FRAME } }, |
| { NEARMV, { ALTREF_FRAME, NONE_FRAME } }, |
| { NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, |
| |
| { GLOBALMV, { LAST_FRAME, NONE_FRAME } }, |
| { GLOBALMV, { LAST2_FRAME, NONE_FRAME } }, |
| { GLOBALMV, { LAST3_FRAME, NONE_FRAME } }, |
| { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } }, |
| { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } }, |
| { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } }, |
| { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } }, |
| |
| // TODO(zoeliu): May need to reconsider the order of the modes to check |
| |
| { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| |
| { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, |
| { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| |
| { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| |
| { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, |
| |
| { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, |
| |
| { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, |
| |
| { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, |
| |
| { H_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { V_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, |
| { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, |
| { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, |
| { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, |
| { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, |
| { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, |
| { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, |
| { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, |
| { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, |
| { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, |
| { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, |
| |
| { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, |
| |
| { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, |
| }; |
| |
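| // The three tables below invert av1_mode_order: given a prediction mode |
| // and, for inter modes, its reference frame(s), they return the candidate's |
| // index in av1_mode_order. Entries of -1 mark combinations not searched. |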
| static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = { |
| 7, // DC_PRED, |
| 134, // V_PRED, |
| 133, // H_PRED, |
| 140, // D45_PRED, |
| 135, // D135_PRED, |
| 139, // D113_PRED, |
| 137, // D157_PRED, |
| 136, // D203_PRED, |
| 138, // D67_PRED, |
| 46, // SMOOTH_PRED, |
| 47, // SMOOTH_V_PRED, |
| 48, // SMOOTH_H_PRED, |
| 45, // PAETH_PRED, |
| }; |
| |
| /* clang-format off */ |
| static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] |
| [REF_FRAMES] = { |
| // NEARESTMV, |
| { -1, 0, 1, 2, 6, 3, 4, 5, }, |
| // NEARMV, |
| { -1, 15, 16, 17, 21, 18, 19, 20, }, |
| // GLOBALMV, |
| { -1, 22, 23, 24, 27, 25, 26, 28, }, |
| // NEWMV, |
| { -1, 8, 9, 10, 14, 11, 12, 13, }, |
| }; |
| /* clang-format on */ |
| |
| /* clang-format off */ |
| static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] |
| [REF_FRAMES] = { |
| // NEAREST_NEARESTMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 41, 42, 43, 33, 37, 29, }, |
| { -1, -1, -1, -1, -1, 34, 38, 30, }, |
| { -1, -1, -1, -1, -1, 35, 39, 31, }, |
| { -1, -1, -1, -1, -1, 36, 40, 32, }, |
| { -1, -1, -1, -1, -1, -1, -1, 44, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // NEAR_NEARMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 141, 148, 155, 77, 105, 49, }, |
| { -1, -1, -1, -1, -1, 84, 112, 56, }, |
| { -1, -1, -1, -1, -1, 91, 119, 63, }, |
| { -1, -1, -1, -1, -1, 98, 126, 70, }, |
| { -1, -1, -1, -1, -1, -1, -1, 162, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // NEAREST_NEWMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 143, 150, 157, 79, 107, 51, }, |
| { -1, -1, -1, -1, -1, 86, 114, 58, }, |
| { -1, -1, -1, -1, -1, 93, 121, 65, }, |
| { -1, -1, -1, -1, -1, 100, 128, 72, }, |
| { -1, -1, -1, -1, -1, -1, -1, 164, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // NEW_NEARESTMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 142, 149, 156, 78, 106, 50, }, |
| { -1, -1, -1, -1, -1, 85, 113, 57, }, |
| { -1, -1, -1, -1, -1, 92, 120, 64, }, |
| { -1, -1, -1, -1, -1, 99, 127, 71, }, |
| { -1, -1, -1, -1, -1, -1, -1, 163, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // NEAR_NEWMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 145, 152, 159, 81, 109, 53, }, |
| { -1, -1, -1, -1, -1, 88, 116, 60, }, |
| { -1, -1, -1, -1, -1, 95, 123, 67, }, |
| { -1, -1, -1, -1, -1, 102, 130, 74, }, |
| { -1, -1, -1, -1, -1, -1, -1, 166, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // NEW_NEARMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 144, 151, 158, 80, 108, 52, }, |
| { -1, -1, -1, -1, -1, 87, 115, 59, }, |
| { -1, -1, -1, -1, -1, 94, 122, 66, }, |
| { -1, -1, -1, -1, -1, 101, 129, 73, }, |
| { -1, -1, -1, -1, -1, -1, -1, 165, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // GLOBAL_GLOBALMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 147, 154, 161, 83, 111, 55, }, |
| { -1, -1, -1, -1, -1, 90, 118, 62, }, |
| { -1, -1, -1, -1, -1, 97, 125, 69, }, |
| { -1, -1, -1, -1, -1, 104, 132, 76, }, |
| { -1, -1, -1, -1, -1, -1, -1, 168, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| // NEW_NEWMV, |
| { |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, 146, 153, 160, 82, 110, 54, }, |
| { -1, -1, -1, -1, -1, 89, 117, 61, }, |
| { -1, -1, -1, -1, -1, 96, 124, 68, }, |
| { -1, -1, -1, -1, -1, 103, 131, 75, }, |
| { -1, -1, -1, -1, -1, -1, -1, 167, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| { -1, -1, -1, -1, -1, -1, -1, -1, }, |
| }, |
| }; |
| /* clang-format on */ |
| |
| static int get_prediction_mode_idx(PREDICTION_MODE this_mode, |
| MV_REFERENCE_FRAME ref_frame, |
| MV_REFERENCE_FRAME second_ref_frame) { |
| if (this_mode < INTRA_MODE_END) { |
| assert(ref_frame == INTRA_FRAME); |
| assert(second_ref_frame == NONE_FRAME); |
| return intra_to_mode_idx[this_mode - INTRA_MODE_START]; |
| } |
| if (this_mode >= SINGLE_INTER_MODE_START && |
| this_mode < SINGLE_INTER_MODE_END) { |
| assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); |
| assert(second_ref_frame == NONE_FRAME); |
| return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] |
| [ref_frame]; |
| } |
| if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) { |
| assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); |
| assert((second_ref_frame > INTRA_FRAME) && |
| (second_ref_frame <= ALTREF_FRAME)); |
| return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] |
| [second_ref_frame]; |
| } |
| assert(0); |
| return -1; |
| } |
| |
| static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { |
| DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, |
| SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, |
| D67_PRED, D113_PRED, D45_PRED, |
| }; |
| |
| static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { |
| UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, |
| UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, |
| UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, |
| UV_D113_PRED, UV_D45_PRED, |
| }; |
| |
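| // Search state carried across candidate modes during the inter-frame RD |
| // mode search: the best mode/rate/distortion found so far, per-reference |
| // statistics, and cached intra/UV results so they are evaluated only once. |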
| typedef struct InterModeSearchState { |
| int64_t best_rd; |
| MB_MODE_INFO best_mbmode; |
| int best_rate_y; |
| int best_rate_uv; |
| int best_mode_skippable; |
| int best_skip2; |
| int best_mode_index; |
| int skip_intra_modes; |
| int num_available_refs; |
| int64_t dist_refs[REF_FRAMES]; |
| int dist_order_refs[REF_FRAMES]; |
| int64_t mode_threshold[MAX_MODES]; |
| PREDICTION_MODE best_intra_mode; |
| int64_t best_intra_rd; |
| int angle_stats_ready; |
| uint8_t directional_mode_skip_mask[INTRA_MODES]; |
| unsigned int best_pred_sse; |
| int rate_uv_intra[TX_SIZES_ALL]; |
| int rate_uv_tokenonly[TX_SIZES_ALL]; |
| int64_t dist_uvs[TX_SIZES_ALL]; |
| int skip_uvs[TX_SIZES_ALL]; |
| UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; |
| PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; |
| int8_t uv_angle_delta[TX_SIZES_ALL]; |
| int64_t best_pred_rd[REFERENCE_MODES]; |
| int64_t best_pred_diff[REFERENCE_MODES]; |
| // Save a set of single_newmv for each checked ref_mv. |
| int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES]; |
| int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES]; |
| int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES]; |
| int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES]; |
| } InterModeSearchState; |
| |
| #if CONFIG_COLLECT_INTER_MODE_RD_STATS |
| |
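| // Per-block-size linear model used to estimate the RD cost of a candidate |
| // from its prediction SSE before the residue is coded: 'a' and 'b' are the |
| // slope and intercept fitted by av1_inter_mode_data_fit() and dist_mean is |
| // the mean distortion of the collected samples. |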
| typedef struct InterModeRdModel { |
| int ready; |
| double a; |
| double b; |
| double dist_mean; |
| int skip_count; |
| int non_skip_count; |
| int fp_skip_count; |
| int bracket_idx; |
| } InterModeRdModel; |
| |
| InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; |
| |
| #define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 |
| static int inter_mode_data_idx[4]; |
| static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; |
| static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; |
| static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; |
| static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; |
| static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; |
| |
| int inter_mode_data_block_idx(BLOCK_SIZE bsize) { |
| if (bsize == BLOCK_8X8) return 1; |
| if (bsize == BLOCK_16X16) return 2; |
| if (bsize == BLOCK_32X32) return 3; |
| return -1; |
| } |
| |
| void av1_inter_mode_data_init() { |
| for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { |
| const int block_idx = inter_mode_data_block_idx(i); |
| if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; |
| InterModeRdModel *md = &inter_mode_rd_models[i]; |
| md->ready = 0; |
| md->skip_count = 0; |
| md->non_skip_count = 0; |
| md->fp_skip_count = 0; |
| md->bracket_idx = 0; |
| } |
| } |
| |
| void av1_inter_mode_data_show(const AV1_COMMON *cm) { |
| printf("frame_offset %d\n", cm->frame_offset); |
| for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { |
| const int block_idx = inter_mode_data_block_idx(i); |
| if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; |
| InterModeRdModel *md = &inter_mode_rd_models[i]; |
| if (md->ready) { |
| printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i, |
| md->non_skip_count, md->skip_count, md->fp_skip_count); |
| } |
| } |
| } |
| |
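| // Estimate the RD cost of a candidate from its SSE using the fitted model: |
| // ld = a * sse + b approximates the distortion reduced per bit of residue |
| // rate, so the residue rate is estimated as (sse - dist_mean) / ld and the |
| // distortion as dist_mean. |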
| static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse, |
| int curr_cost) { |
| aom_clear_system_state(); |
| InterModeRdModel *md = &inter_mode_rd_models[bsize]; |
| if (md->ready) { |
| const double est_ld = md->a * sse + md->b; |
| const double est_residue_cost = (sse - md->dist_mean) / est_ld; |
| const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost; |
| const int64_t int64_dist_mean = (int64_t)round(md->dist_mean); |
| const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean); |
| return est_rd; |
| } |
| return 0; |
| } |
| |
| #define DATA_BRACKETS 7 |
| static const int data_num_threshold[DATA_BRACKETS] = { |
| 200, 400, 800, 1600, 3200, 6400, INT32_MAX |
| }; |
| |
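| // Refit the per-block-size model once the number of collected samples |
| // crosses the next data_num_threshold bracket: a simple least-squares |
| // regression of ld = (sse - dist) / residue_cost against sse yields the |
| // slope 'a' and intercept 'b' used by get_est_rd(). |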
| void av1_inter_mode_data_fit(int rdmult) { |
| aom_clear_system_state(); |
| for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { |
| const int block_idx = inter_mode_data_block_idx(bsize); |
| InterModeRdModel *md = &inter_mode_rd_models[bsize]; |
| if (block_idx == -1) continue; |
| int data_num = inter_mode_data_idx[block_idx]; |
| if (data_num < data_num_threshold[md->bracket_idx]) { |
| continue; |
| } |
| double my = 0; |
| double mx = 0; |
| double dx = 0; |
| double dxy = 0; |
| double dist_mean = 0; |
| const int train_num = data_num; |
| for (int i = 0; i < train_num; ++i) { |
| const double sse = (double)inter_mode_data_sse[block_idx][i]; |
| const double dist = (double)inter_mode_data_dist[block_idx][i]; |
| const double residue_cost = inter_mode_data_residue_cost[block_idx][i]; |
| const double ld = (sse - dist) / residue_cost; |
| dist_mean += dist; |
| my += ld; |
| mx += sse; |
| dx += sse * sse; |
| dxy += sse * ld; |
| } |
| dist_mean = dist_mean / data_num; |
| my = my / train_num; |
| mx = mx / train_num; |
| dx = sqrt(dx / train_num); |
| dxy = dxy / train_num; |
| |
| md->dist_mean = dist_mean; |
| md->a = (dxy - mx * my) / (dx * dx - mx * mx); |
| md->b = my - md->a * mx; |
| ++md->bracket_idx; |
| md->ready = 1; |
| assert(md->bracket_idx < DATA_BRACKETS); |
| |
| (void)rdmult; |
| #if 0 |
| int skip_count = 0; |
| int fp_skip_count = 0; |
| double avg_error = 0; |
| const int test_num = data_num; |
| for (int i = 0; i < data_num; ++i) { |
| const int64_t sse = inter_mode_data_sse[block_idx][i]; |
| const int64_t dist = inter_mode_data_dist[block_idx][i]; |
| const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i]; |
| const int64_t all_cost = inter_mode_data_all_cost[block_idx][i]; |
| const int64_t est_rd = |
| get_est_rd(bsize, rdmult, sse, all_cost - residue_cost); |
| const int64_t real_rd = RDCOST(rdmult, all_cost, dist); |
| const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i]; |
| if (est_rd > ref_best_rd) { |
| ++skip_count; |
| if (real_rd < ref_best_rd) { |
| ++fp_skip_count; |
| } |
| } |
| avg_error += abs(est_rd - real_rd) * 100. / real_rd; |
| } |
| avg_error /= test_num; |
| printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n", |
| test_num, bsize, avg_error, skip_count, fp_skip_count); |
| #endif |
| } |
| } |
| |
| static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist, |
| int residue_cost, int all_cost, |
| int64_t ref_best_rd) { |
| if (residue_cost == 0 || sse == dist) return; |
| const int block_idx = inter_mode_data_block_idx(bsize); |
| if (block_idx == -1) return; |
| if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) { |
| const int data_idx = inter_mode_data_idx[block_idx]; |
| inter_mode_data_sse[block_idx][data_idx] = sse; |
| inter_mode_data_dist[block_idx][data_idx] = dist; |
| inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost; |
| inter_mode_data_all_cost[block_idx][data_idx] = all_cost; |
| inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd; |
| ++inter_mode_data_idx[block_idx]; |
| } |
| } |
| #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS |
| |
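| // Bit cost of coding value v with a truncated binary (near-uniform) code |
| // over n symbols: the first m = 2^l - n symbols take l - 1 bits, the rest |
| // take l bits. For example, with n = 11 we get l = 4 and m = 5, so values |
| // 0..4 cost 3 bits and values 5..10 cost 4 bits. |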
| static INLINE int write_uniform_cost(int n, int v) { |
| const int l = get_unsigned_bits(n); |
| const int m = (1 << l) - n; |
| if (l == 0) return 0; |
| if (v < m) |
| return av1_cost_literal(l - 1); |
| else |
| return av1_cost_literal(l); |
| } |
| |
| // Similar to store_cfl_required(), but for use during the RDO process, |
| // where we haven't yet determined whether this block uses CfL. |
| static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, |
| const MACROBLOCK *x) { |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED; |
| |
| if (!xd->cfl.is_chroma_reference) { |
| // For non-chroma-reference blocks, we should always store the luma pixels, |
| // in case the corresponding chroma-reference block uses CfL. |
| // Note that this can only happen for block sizes which are <8 on |
| // their shortest side, as otherwise they would be chroma reference |
| // blocks. |
| return CFL_ALLOWED; |
| } |
| |
| // For chroma reference blocks, we should store data in the encoder iff we're |
| // allowed to try out CfL. |
| return is_cfl_allowed(xd); |
| } |
| |
| // Constants for the prune 1 and prune 2 decision boundaries. |
| #define FAST_EXT_TX_CORR_MID 0.0 |
| #define FAST_EXT_TX_EDST_MID 0.1 |
| #define FAST_EXT_TX_CORR_MARGIN 0.5 |
| #define FAST_EXT_TX_EDST_MARGIN 0.3 |
| |
| static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, |
| RD_STATS *rd_stats, BLOCK_SIZE bsize, |
| int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode); |
| |
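| // Sum of squared error between src and dst, restricted to the visible part |
| // of a transform block (blocks straddling the frame border have rows and |
| // columns that fall outside the picture). When the whole block is visible, |
| // the block-size variance function is used to obtain the SSE directly. |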
| static unsigned pixel_dist_visible_only( |
| const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, |
| const int src_stride, const uint8_t *dst, const int dst_stride, |
| const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, |
| int visible_cols) { |
| unsigned sse; |
| |
| if (txb_rows == visible_rows && txb_cols == visible_cols) { |
| cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); |
| return sse; |
| } |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
| uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, |
| visible_cols, visible_rows); |
| return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); |
| } |
| sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, |
| visible_rows); |
| return sse; |
| } |
| |
| #if CONFIG_DIST_8X8 |
| static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, |
| int sstride, int coeff_shift) { |
| uint64_t svar = 0; |
| uint64_t dvar = 0; |
| uint64_t sum_s = 0; |
| uint64_t sum_d = 0; |
| uint64_t sum_s2 = 0; |
| uint64_t sum_d2 = 0; |
| uint64_t sum_sd = 0; |
| uint64_t dist = 0; |
| |
| int i, j; |
| for (i = 0; i < 8; i++) { |
| for (j = 0; j < 8; j++) { |
| sum_s += src[i * sstride + j]; |
| sum_d += dst[i * dstride + j]; |
| sum_s2 += src[i * sstride + j] * src[i * sstride + j]; |
| sum_d2 += dst[i * dstride + j] * dst[i * dstride + j]; |
| sum_sd += src[i * sstride + j] * dst[i * dstride + j]; |
| } |
| } |
| /* Compute the variance -- the calculation cannot go negative. */ |
| svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); |
| dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); |
| |
| // Tuning of jm's original deringing distortion metric as used in the CDEF |
| // tool, suggested by jm. |
| const uint64_t a = 4; |
| const uint64_t b = 2; |
| const uint64_t c1 = (400 * a << 2 * coeff_shift); |
| const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift); |
| |
| dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * |
| (svar + dvar + c1) / |
| (sqrt(svar * (double)dvar + c2))); |
| |
| // Calibrate dist to have similar rate for the same QP with MSE only |
| // distortion (as in master branch) |
| dist = (uint64_t)((float)dist * 0.75); |
| |
| return dist; |
| } |
| |
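| // Integer variance of a 4x4 block of pixels. |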
| static int od_compute_var_4x4(uint16_t *x, int stride) { |
| int sum; |
| int s2; |
| int i; |
| sum = 0; |
| s2 = 0; |
| for (i = 0; i < 4; i++) { |
| int j; |
| for (j = 0; j < 4; j++) { |
| int t; |
| |
| t = x[i * stride + j]; |
| sum += t; |
| s2 += t * t; |
| } |
| } |
| |
| return (s2 - (sum * sum >> 4)) >> 4; |
| } |
| |
| /* OD_DIST_LP_MID controls the frequency weighting filter used for computing |
| the distortion. For a value X, the filter is [1 X 1]/(X + 2) and |
| is applied both horizontally and vertically. For X=5, the filter is |
| a good approximation for the OD_QM8_Q4_HVS quantization matrix. */ |
| #define OD_DIST_LP_MID (5) |
| #define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2) |
| |
| static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x, |
| uint16_t *y, od_coeff *e_lp, int stride) { |
| double sum; |
| int min_var; |
| double mean_var; |
| double var_stat; |
| double activity; |
| double calibration; |
| int i; |
| int j; |
| double vardist; |
| |
| vardist = 0; |
| |
| #if 1 |
| min_var = INT_MAX; |
| mean_var = 0; |
| for (i = 0; i < 3; i++) { |
| for (j = 0; j < 3; j++) { |
| int varx; |
| int vary; |
| varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride); |
| vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride); |
| min_var = OD_MINI(min_var, varx); |
| mean_var += 1. / (1 + varx); |
| /* The cast to (double) is to avoid an overflow before the sqrt.*/ |
| vardist += varx - 2 * sqrt(varx * (double)vary) + vary; |
| } |
| } |
| /* We use a different variance statistic depending on whether activity |
| masking is used, since the harmonic mean appeared slightly worse with |
| masking off. The calibration constant just ensures that we preserve the |
| rate compared to activity=1. */ |
| if (use_activity_masking) { |
| calibration = 1.95; |
| var_stat = 9. / mean_var; |
| } else { |
| calibration = 1.62; |
| var_stat = min_var; |
| } |
| /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the |
| activity masking constant. */ |
| activity = calibration * pow(.25 + var_stat, -1. / 6); |
| #else |
| activity = 1; |
| #endif // 1 |
| sum = 0; |
| for (i = 0; i < 8; i++) { |
| for (j = 0; j < 8; j++) |
| sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j]; |
| } |
| /* Normalize the filter to unit DC response. */ |
| sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM * |
| OD_DIST_LP_NORM); |
| return activity * activity * (sum + vardist); |
| } |
| |
| // Note: Inputs x and y are in the pixel domain. |
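| // On entry, 'tmp' holds the error filtered horizontally with the |
| // [1 OD_DIST_LP_MID 1] kernel; this function applies the same kernel |
| // vertically to form the low-passed error e_lp before summing the 8x8 |
| // distortions. |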
| static double od_compute_dist_common(int activity_masking, uint16_t *x, |
| uint16_t *y, int bsize_w, int bsize_h, |
| int qindex, od_coeff *tmp, |
| od_coeff *e_lp) { |
| int i, j; |
| double sum = 0; |
| const int mid = OD_DIST_LP_MID; |
| |
| for (j = 0; j < bsize_w; j++) { |
| e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j]; |
| e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] + |
| 2 * tmp[(bsize_h - 2) * bsize_w + j]; |
| } |
| for (i = 1; i < bsize_h - 1; i++) { |
| for (j = 0; j < bsize_w; j++) { |
| e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] + |
| tmp[(i - 1) * bsize_w + j] + |
| tmp[(i + 1) * bsize_w + j]; |
| } |
| } |
| for (i = 0; i < bsize_h; i += 8) { |
| for (j = 0; j < bsize_w; j += 8) { |
| sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j], |
| &y[i * bsize_w + j], &e_lp[i * bsize_w + j], |
| bsize_w); |
| } |
| } |
| /* Scale according to linear regression against SSE, for 8x8 blocks. */ |
| if (activity_masking) { |
| sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) + |
| (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0); |
| } else { |
| sum *= qindex >= 128 |
| ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128) |
| : qindex <= 43 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43) |
| : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43); |
| } |
| |
| return sum; |
| } |
| |
| static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, |
| int bsize_h, int qindex) { |
| assert(bsize_w >= 8 && bsize_h >= 8); |
| |
| int activity_masking = 0; |
| |
| int i, j; |
| DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); |
| DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); |
| DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); |
| for (i = 0; i < bsize_h; i++) { |
| for (j = 0; j < bsize_w; j++) { |
| e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j]; |
| } |
| } |
| int mid = OD_DIST_LP_MID; |
| for (i = 0; i < bsize_h; i++) { |
| tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1]; |
| tmp[i * bsize_w + bsize_w - 1] = |
| mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2]; |
| for (j = 1; j < bsize_w - 1; j++) { |
| tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] + |
| e[i * bsize_w + j + 1]; |
| } |
| } |
| return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h, |
| qindex, tmp, e_lp); |
| } |
| |
| static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w, |
| int bsize_h, int qindex) { |
| assert(bsize_w >= 8 && bsize_h >= 8); |
| |
| int activity_masking = 0; |
| |
| DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]); |
| DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); |
| DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); |
| int i, j; |
| for (i = 0; i < bsize_h; i++) { |
| for (j = 0; j < bsize_w; j++) { |
| y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j]; |
| } |
| } |
| int mid = OD_DIST_LP_MID; |
| for (i = 0; i < bsize_h; i++) { |
| tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1]; |
| tmp[i * bsize_w + bsize_w - 1] = |
| mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2]; |
| for (j = 1; j < bsize_w - 1; j++) { |
| tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] + |
| e[i * bsize_w + j + 1]; |
| } |
| } |
| return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h, |
| qindex, tmp, e_lp); |
| } |
| |
| int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, |
| const uint8_t *src, int src_stride, const uint8_t *dst, |
| int dst_stride, const BLOCK_SIZE tx_bsize, int bsw, |
| int bsh, int visible_w, int visible_h, int qindex) { |
| int64_t d = 0; |
| int i, j; |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); |
| DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]); |
| |
| assert(bsw >= 8); |
| assert(bsh >= 8); |
| assert((bsw & 0x07) == 0); |
| assert((bsh & 0x07) == 0); |
| |
| if (x->tune_metric == AOM_TUNE_CDEF_DIST || |
| x->tune_metric == AOM_TUNE_DAALA_DIST) { |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; |
| |
| if ((bsw == visible_w) && (bsh == visible_h)) { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; |
| } else { |
| for (j = 0; j < visible_h; j++) |
| for (i = 0; i < visible_w; i++) |
| rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; |
| |
| if (visible_w < bsw) { |
| for (j = 0; j < bsh; j++) |
| for (i = visible_w; i < bsw; i++) |
| rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; |
| } |
| |
| if (visible_h < bsh) { |
| for (j = visible_h; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; |
| } |
| } |
| } else { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; |
| |
| if ((bsw == visible_w) && (bsh == visible_h)) { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i]; |
| } else { |
| for (j = 0; j < visible_h; j++) |
| for (i = 0; i < visible_w; i++) |
| rec[j * bsw + i] = dst[j * dst_stride + i]; |
| |
| if (visible_w < bsw) { |
| for (j = 0; j < bsh; j++) |
| for (i = visible_w; i < bsw; i++) |
| rec[j * bsw + i] = src[j * src_stride + i]; |
| } |
| |
| if (visible_h < bsh) { |
| for (j = visible_h; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| rec[j * bsw + i] = src[j * src_stride + i]; |
| } |
| } |
| } |
| } |
| |
| if (x->tune_metric == AOM_TUNE_DAALA_DIST) { |
| d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex); |
| } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) { |
| int coeff_shift = AOMMAX(xd->bd - 8, 0); |
| |
| for (i = 0; i < bsh; i += 8) { |
| for (j = 0; j < bsw; j += 8) { |
| d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j], |
| bsw, coeff_shift); |
| } |
| } |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) |
| d = ((uint64_t)d) >> 2 * coeff_shift; |
| } else { |
| // Otherwise, MSE by default |
| d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride, |
| tx_bsize, bsh, bsw, visible_h, visible_w); |
| } |
| |
| return d; |
| } |
| |
| static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, |
| int src_stride, const int16_t *diff, |
| int diff_stride, int bsw, int bsh, int visible_w, |
| int visible_h, int qindex) { |
| int64_t d = 0; |
| int i, j; |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); |
| DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]); |
| |
| assert(bsw >= 8); |
| assert(bsh >= 8); |
| assert((bsw & 0x07) == 0); |
| assert((bsh & 0x07) == 0); |
| |
| if (x->tune_metric == AOM_TUNE_CDEF_DIST || |
| x->tune_metric == AOM_TUNE_DAALA_DIST) { |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; |
| } else { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; |
| } |
| |
| if ((bsw == visible_w) && (bsh == visible_h)) { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| diff16[j * bsw + i] = diff[j * diff_stride + i]; |
| } else { |
| for (j = 0; j < visible_h; j++) |
| for (i = 0; i < visible_w; i++) |
| diff16[j * bsw + i] = diff[j * diff_stride + i]; |
| |
| if (visible_w < bsw) { |
| for (j = 0; j < bsh; j++) |
| for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0; |
| } |
| |
| if (visible_h < bsh) { |
| for (j = visible_h; j < bsh; j++) |
| for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0; |
| } |
| } |
| } |
| |
| if (x->tune_metric == AOM_TUNE_DAALA_DIST) { |
| d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex); |
| } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) { |
| int coeff_shift = AOMMAX(xd->bd - 8, 0); |
| DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]); |
| |
| for (i = 0; i < bsh; i++) { |
| for (j = 0; j < bsw; j++) { |
| dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j]; |
| } |
| } |
| |
| for (i = 0; i < bsh; i += 8) { |
| for (j = 0; j < bsw; j += 8) { |
| d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j], |
| bsw, coeff_shift); |
| } |
| } |
| // Don't scale 'd' for HBD here, since the caller does that scaling for |
| // diff input. |
| } else { |
| // Otherwise, MSE by default |
| d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h); |
| } |
| |
| return d; |
| } |
| #endif // CONFIG_DIST_8X8 |
| |
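| // Measures how the prediction-residual energy is distributed over a 4x4 |
| // grid of sub-blocks: hordist/verdist receive the column-wise and row-wise |
| // sums of the sub-block SSEs, normalized by the total energy of the block. |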
| static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| const uint8_t *src, int src_stride, |
| const uint8_t *dst, int dst_stride, |
| int need_4th, double *hordist, |
| double *verdist) { |
| const int bw = block_size_wide[bsize]; |
| const int bh = block_size_high[bsize]; |
| unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| |
| if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { |
| // Special cases: calculate 'esq' values manually, as we don't have 'vf' |
| // functions for the 16 (very small) sub-blocks of this block. |
| const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; |
| const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; |
| assert(bw <= 32); |
| assert(bh <= 32); |
| assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); |
| if (cpi->common.use_highbitdepth) { |
| const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); |
| const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); |
| for (int i = 0; i < bh; ++i) |
| for (int j = 0; j < bw; ++j) { |
| const int index = (j >> w_shift) + ((i >> h_shift) << 2); |
| esq[index] += |
| (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * |
| (src16[j + i * src_stride] - dst16[j + i * dst_stride]); |
| } |
| } else { |
| for (int i = 0; i < bh; ++i) |
| for (int j = 0; j < bw; ++j) { |
| const int index = (j >> w_shift) + ((i >> h_shift) << 2); |
| esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * |
| (src[j + i * src_stride] - dst[j + i * dst_stride]); |
| } |
| } |
| } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. |
| const int f_index = |
| (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16; |
| assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); |
| const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; |
| assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); |
| assert(block_size_high[bsize] == 4 * block_size_high[subsize]); |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[1]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[2]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[3]); |
| src += bh / 4 * src_stride; |
| dst += bh / 4 * dst_stride; |
| |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[5]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[6]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[7]); |
| src += bh / 4 * src_stride; |
| dst += bh / 4 * dst_stride; |
| |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[9]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[10]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[11]); |
| src += bh / 4 * src_stride; |
| dst += bh / 4 * dst_stride; |
| |
| cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); |
| cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, |
| &esq[13]); |
| cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, |
| &esq[14]); |
| cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, |
| dst_stride, &esq[15]); |
| } |
| |
| double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + |
| esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + |
| esq[12] + esq[13] + esq[14] + esq[15]; |
| if (total > 0) { |
| const double e_recip = 1.0 / total; |
| hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; |
| hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; |
| hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; |
| if (need_4th) { |
| hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; |
| } |
| verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; |
| verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; |
| verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; |
| if (need_4th) { |
| verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; |
| } |
| } else { |
| hordist[0] = verdist[0] = 0.25; |
| hordist[1] = verdist[1] = 0.25; |
| hordist[2] = verdist[2] = 0.25; |
| if (need_4th) { |
| hordist[3] = verdist[3] = 0.25; |
| } |
| } |
| } |
| |
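| // Projects the vertical/horizontal residual-energy distributions onto the |
| // ADST_FLIP_SVM hyperplanes and prunes either ADST or FLIPADST in each |
| // direction when the projection is far enough from the decision boundary. |
| // Bits 0-7 of the returned mask are set from the vertical projection and |
| // bits 8-15 from the horizontal one. |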
| static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| const uint8_t *src, int src_stride, |
| const uint8_t *dst, int dst_stride) { |
| int prune_bitmask = 0; |
| double svm_proj_h = 0, svm_proj_v = 0; |
| double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 }; |
| get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0, |
| hdist, vdist); |
| |
| svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] + |
| vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3]; |
| svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] + |
| hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7]; |
| if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << FLIPADST_1D; |
| else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << ADST_1D; |
| |
| if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << (FLIPADST_1D + 8); |
| else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) |
| prune_bitmask |= 1 << (ADST_1D + 8); |
| |
| return prune_bitmask; |
| } |
| |
| static void get_horver_correlation(const int16_t *diff, int stride, int w, |
| int h, double *hcorr, double *vcorr) { |
| // Computes the horizontal/vertical correlation coefficients of the residual |
| // and stores them in *hcorr / *vcorr. |
| const int num = (h - 1) * (w - 1); |
| double num_r; |
| int i, j; |
| int64_t xy_sum = 0, xz_sum = 0; |
| int64_t x_sum = 0, y_sum = 0, z_sum = 0; |
| int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0; |
| double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n; |
| *hcorr = *vcorr = 1; |
| |
| assert(num > 0); |
| num_r = 1.0 / num; |
| for (i = 1; i < h; ++i) { |
| for (j = 1; j < w; ++j) { |
| const int16_t x = diff[i * stride + j]; |
| const int16_t y = diff[i * stride + j - 1]; |
| const int16_t z = diff[(i - 1) * stride + j]; |
| xy_sum += x * y; |
| xz_sum += x * z; |
| x_sum += x; |
| y_sum += y; |
| z_sum += z; |
| x2_sum += x * x; |
| y2_sum += y * y; |
| z2_sum += z * z; |
| } |
| } |
| x_var_n = x2_sum - (x_sum * x_sum) * num_r; |
| y_var_n = y2_sum - (y_sum * y_sum) * num_r; |
| z_var_n = z2_sum - (z_sum * z_sum) * num_r; |
| xy_var_n = xy_sum - (x_sum * y_sum) * num_r; |
| xz_var_n = xz_sum - (x_sum * z_sum) * num_r; |
| if (x_var_n > 0 && y_var_n > 0) { |
| *hcorr = xy_var_n / sqrt(x_var_n * y_var_n); |
| *hcorr = *hcorr < 0 ? 0 : *hcorr; |
| } |
| if (x_var_n > 0 && z_var_n > 0) { |
| *vcorr = xz_var_n / sqrt(x_var_n * z_var_n); |
| *vcorr = *vcorr < 0 ? 0 : *vcorr; |
| } |
| } |
| |
| static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { |
| double hcorr, vcorr; |
| int prune_bitmask = 0; |
| get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr); |
| |
| if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << IDTX_1D; |
| else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << DCT_1D; |
| |
| if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << (IDTX_1D + 8); |
| else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) |
| prune_bitmask |= 1 << (DCT_1D + 8); |
| return prune_bitmask; |
| } |
| |
| // Performance drop: 0.5%, Speed improvement: 24% |
| static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| MACROBLOCK *x, const MACROBLOCKD *xd, |
| int adst_flipadst, int dct_idtx) { |
| int prune = 0; |
| |
| if (adst_flipadst) { |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const struct macroblockd_plane *const pd = &xd->plane[0]; |
| prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, |
| pd->dst.buf, pd->dst.stride); |
| } |
| if (dct_idtx) { |
| av1_subtract_plane(x, bsize, 0); |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const int bw = block_size_wide[bsize]; |
| const int bh = block_size_high[bsize]; |
| prune |= dct_vs_idtx(p->src_diff, bw, bw, bh); |
| } |
| |
| return prune; |
| } |
| |
| // Performance drop: 0.3%, Speed improvement: 5% |
| static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, |
| const MACROBLOCK *x, const MACROBLOCKD *xd) { |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const struct macroblockd_plane *const pd = &xd->plane[0]; |
| return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf, |
| pd->dst.stride); |
| } |
| |
| // 1D transforms used in the inter set; this needs to be changed whenever |
| // ext_tx_used_inter is changed. |
| static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = { |
| { 1, 0, 0, 0 }, |
| { 1, 1, 1, 1 }, |
| { 1, 1, 1, 1 }, |
| { 1, 0, 0, 1 }, |
| }; |
| |
| static void get_energy_distribution_finer(const int16_t *diff, int stride, |
| int bw, int bh, float *hordist, |
| float *verdist) { |
| // First compute downscaled block energy values (esq); downscale factors |
| // are defined by w_shift and h_shift. |
| unsigned int esq[256]; |
| const int w_shift = bw <= 8 ? 0 : 1; |
| const int h_shift = bh <= 8 ? 0 : 1; |
| const int esq_w = bw <= 8 ? bw : bw / 2; |
| const int esq_h = bh <= 8 ? bh : bh / 2; |
| const int esq_sz = esq_w * esq_h; |
| int i, j; |
| memset(esq, 0, esq_sz * sizeof(esq[0])); |
| for (i = 0; i < bh; i++) { |
| unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; |
| const int16_t *cur_diff_row = diff + i * stride; |
| for (j = 0; j < bw; j++) { |
| cur_esq_row[j >> w_shift] += cur_diff_row[j] * cur_diff_row[j]; |
| } |
| } |
| |
| uint64_t total = 0; |
| for (i = 0; i < esq_sz; i++) total += esq[i]; |
| |
| // Output hordist and verdist arrays are normalized 1D projections of esq |
| if (total == 0) { |
| float hor_val = 1.0f / esq_w; |
| for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; |
| float ver_val = 1.0f / esq_h; |
| for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; |
| return; |
| } |
| |
| const float e_recip = 1.0f / (float)total; |
| memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); |
| memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); |
| const unsigned int *cur_esq_row; |
| for (i = 0; i < esq_h - 1; i++) { |
| cur_esq_row = esq + i * esq_w; |
| for (j = 0; j < esq_w - 1; j++) { |
| hordist[j] += (float)cur_esq_row[j]; |
| verdist[i] += (float)cur_esq_row[j]; |
| } |
| verdist[i] += (float)cur_esq_row[j]; |
| } |
| cur_esq_row = esq + i * esq_w; |
| for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; |
| |
| for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; |
| for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; |
| } |
| |
| // Similar to get_horver_correlation(), but also takes the first row/column |
| // into account when computing the horizontal/vertical correlation. |
| static void get_horver_correlation_full(const int16_t *diff, int stride, int w, |
| int h, float *hcorr, float *vcorr) { |
| const float num_hor = (float)(h * (w - 1)); |
| const float num_ver = (float)((h - 1) * w); |
| int i, j; |
| |
| // The following notation is used: |
| // x - current pixel |
| // y - left neighbor pixel |
| // z - top neighbor pixel |
| int64_t xy_sum = 0, xz_sum = 0; |
| int64_t xhor_sum = 0, xver_sum = 0, y_sum = 0, z_sum = 0; |
| int64_t x2hor_sum = 0, x2ver_sum = 0, y2_sum = 0, z2_sum = 0; |
| |
| int16_t x, y, z; |
| for (j = 1; j < w; ++j) { |
| x = diff[j]; |
| y = diff[j - 1]; |
| xy_sum += x * y; |
| xhor_sum += x; |
| y_sum += y; |
| x2hor_sum += x * x; |
| y2_sum += y * y; |
| } |
| for (i = 1; i < h; ++i) { |
| x = diff[i * stride]; |
| z = diff[(i - 1) * stride]; |
| xz_sum += x * z; |
| xver_sum += x; |
| z_sum += z; |
| x2ver_sum += x * x; |
| z2_sum += z * z; |
| for (j = 1; j < w; ++j) { |
| x = diff[i * stride + j]; |
| y = diff[i * stride + j - 1]; |
| z = diff[(i - 1) * stride + j]; |
| xy_sum += x * y; |
| xz_sum += x * z; |
| xhor_sum += x; |
| xver_sum += x; |
| y_sum += y; |
| z_sum += z; |
| x2hor_sum += x * x; |
| x2ver_sum += x * x; |
| y2_sum += y * y; |
| z2_sum += z * z; |
| } |
| } |
| const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; |
| const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; |
| const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; |
| const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; |
| const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; |
| const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; |
| |
| *hcorr = *vcorr = 1; |
| if (xhor_var_n > 0 && y_var_n > 0) { |
| *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); |
| *hcorr = *hcorr < 0 ? 0 : *hcorr; |
| } |
| if (xver_var_n > 0 && z_var_n > 0) { |
| *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); |
| *vcorr = *vcorr < 0 ? 0 : *vcorr; |
| } |
| } |
| |
| // Transforms raw scores into a probability distribution across 16 TX types |
| static void score_2D_transform_pow8(float *scores_2D, float shift) { |
| float sum = 0.0f; |
| int i; |
| |
| for (i = 0; i < 16; i++) { |
| float v, v2, v4; |
| v = AOMMAX(scores_2D[i] + shift, 0.0f); |
| v2 = v * v; |
| v4 = v2 * v2; |
| scores_2D[i] = v4 * v4; |
| sum += scores_2D[i]; |
| } |
| for (i = 0; i < 16; i++) scores_2D[i] /= sum; |
| } |
| |
| // These thresholds were calibrated so that the model prunes a predictable |
| // number of TX types on average: selecting the threshold with index i |
| // leads to pruning i+1 TX types on average. |
| static const float *prune_2D_adaptive_thresholds[] = { |
| // TX_4X4 |
| (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f, |
| 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f, |
| 0.08606f, 0.09827f }, |
| // TX_8X8 |
| (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f, |
| 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f, |
| 0.09363f, 0.11682f }, |
| // TX_16X16 |
| (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, |
| 0.06897f, 0.07629f, 0.08875f, 0.11169f }, |
| // TX_32X32 |
| NULL, |
| // TX_64X64 |
| NULL, |
| // TX_4X8 |
| (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f, |
| 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, |
| 0.09119f, 0.10828f }, |
| // TX_8X4 |
| (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f, |
| 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, |
| 0.09167f, 0.10974f }, |
| // TX_8X16 |
| (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f, |
| 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f, |
| 0.09509f, 0.12097f }, |
| // TX_16X8 |
| (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f, |
| 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f, |
| 0.09485f, 0.12048f }, |
| // TX_16X32 |
| (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f, |
| 0.06506f, 0.07385f, 0.08606f, 0.10925f }, |
| // TX_32X16 |
| (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f, |
| 0.06531f, 0.07336f, 0.08582f, 0.11072f }, |
| // TX_32X64 |
| NULL, |
| // TX_64X32 |
| NULL, |
| // TX_4X16 |
| NULL, |
| // TX_16X4 |
| NULL, |
| // TX_8X32 |
| NULL, |
| // TX_32X8 |
| NULL, |
| // TX_16X64 |
| NULL, |
| // TX_64X16 |
| NULL, |
| }; |
| |
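| // Neural-net based transform-type pruning: separate horizontal and vertical |
| // networks score the four 1-D transform kinds from energy-distribution and |
| // correlation features; the 16 2-D scores are products of a vertical and a |
| // horizontal score, sharpened by score_2D_transform_pow8(), and every type |
| // scoring below an adaptive threshold (except the single best one) is added |
| // to the prune mask. |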
| static int prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, |
| int blk_row, int blk_col, TxSetType tx_set_type, |
| TX_TYPE_PRUNE_MODE prune_mode) { |
| static const int tx_type_table_2D[16] = { |
| DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, |
| ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, |
| FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, |
| H_DCT, H_ADST, H_FLIPADST, IDTX |
| }; |
| if (tx_set_type != EXT_TX_SET_ALL16 && |
| tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) |
| return 0; |
| const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; |
| const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; |
| if (!nn_config_hor || !nn_config_ver) return 0; // Model not established yet. |
| |
| aom_clear_system_state(); |
| float hfeatures[16], vfeatures[16]; |
| float hscores[4], vscores[4]; |
| float scores_2D[16]; |
| const int bw = tx_size_wide[tx_size]; |
| const int bh = tx_size_high[tx_size]; |
| const int hfeatures_num = bw <= 8 ? bw : bw / 2; |
| const int vfeatures_num = bh <= 8 ? bh : bh / 2; |
| assert(hfeatures_num <= 16); |
| assert(vfeatures_num <= 16); |
| |
| const struct macroblock_plane *const p = &x->plane[0]; |
| const int diff_stride = block_size_wide[bsize]; |
| const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; |
| get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, |
| vfeatures); |
| get_horver_correlation_full(diff, diff_stride, bw, bh, |
| &hfeatures[hfeatures_num - 1], |
| &vfeatures[vfeatures_num - 1]); |
| av1_nn_predict(hfeatures, nn_config_hor, hscores); |
| av1_nn_predict(vfeatures, nn_config_ver, vscores); |
| |
| float score_2D_average = 0.0f; |
| for (int i = 0; i < 4; i++) { |
| float *cur_scores_2D = scores_2D + i * 4; |
| cur_scores_2D[0] = vscores[i] * hscores[0]; |
| cur_scores_2D[1] = vscores[i] * hscores[1]; |
| cur_scores_2D[2] = vscores[i] * hscores[2]; |
| cur_scores_2D[3] = vscores[i] * hscores[3]; |
| score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] + |
| cur_scores_2D[3]; |
| } |
| score_2D_average /= 16; |
| score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); |
| |
| // Always keep the TX type with the highest score; prune all others whose |
| // score falls below score_thresh. |
| int max_score_i = 0; |
| float max_score = 0.0f; |
| for (int i = 0; i < 16; i++) { |
| if (scores_2D[i] > max_score && |
| av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) { |
| max_score = scores_2D[i]; |
| max_score_i = i; |
| } |
| } |
| |
| int pruning_aggressiveness = 0; |
| if (prune_mode == PRUNE_2D_ACCURATE) { |
| if (tx_set_type == EXT_TX_SET_ALL16) |
| pruning_aggressiveness = 6; |
| else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) |
| pruning_aggressiveness = 4; |
| } else if (prune_mode == PRUNE_2D_FAST) { |
| if (tx_set_type == EXT_TX_SET_ALL16) |
| pruning_aggressiveness = 10; |
| else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) |
| pruning_aggressiveness = 7; |
| } |
| const float score_thresh = |
| prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; |
| |
| int prune_bitmask = 0; |
| for (int i = 0; i < 16; i++) { |
| if (scores_2D[i] < score_thresh && i != max_score_i) |
| prune_bitmask |= (1 << tx_type_table_2D[i]); |
| } |
| return prune_bitmask; |
| } |
| |
| static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, |
| const MACROBLOCKD *const xd, int tx_set_type) { |
| av1_zero(x->tx_search_prune); |
| x->tx_split_prune_flag = 0; |
| const MB_MODE_INFO *mbmi = xd->mi[0]; |
| if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE || |
| x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] || |
| x->cb_partition_scan) |
| return; |
| int tx_set = ext_tx_set_index[1][tx_set_type]; |
| assert(tx_set >= 0); |
| const int *tx_set_1D = ext_tx_used_inter_1D[tx_set]; |
| switch (cpi->sf.tx_type_search.prune_mode) { |
| case NO_PRUNE: return; |
| case PRUNE_ONE: |
| if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return; |
| x->tx_search_prune[tx_set_type] = prune_one_for_sby(cpi, bsize, x, xd); |
| break; |
| case PRUNE_TWO: |
| if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { |
| if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return; |
| x->tx_search_prune[tx_set_type] = |
| prune_two_for_sby(cpi, bsize, x, xd, 0, 1); |
| break; |
| } |
| if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) { |
| x->tx_search_prune[tx_set_type] = |
| prune_two_for_sby(cpi, bsize, x, xd, 1, 0); |
| break; |
| } |
| x->tx_search_prune[tx_set_type] = |
| prune_two_for_sby(cpi, bsize, x, xd, 1, 1); |
| break; |
| case PRUNE_2D_ACCURATE: |
| case PRUNE_2D_FAST: break; |
| default: assert(0); |
| } |
| } |
| |
| static int do_tx_type_search(TX_TYPE tx_type, int prune, |
| TX_TYPE_PRUNE_MODE mode) { |
| // TODO(sarahparker) implement for non ext tx |
| if (mode >= PRUNE_2D_ACCURATE) { |
| return !((prune >> tx_type) & 1); |
| } else { |
| return !(((prune >> vtx_tab[tx_type]) & 1) | |
| ((prune >> (htx_tab[tx_type] + 8)) & 1)); |
| } |
| } |
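| |
| // How the two prune encodings above are laid out: for the 2D prune modes, |
| // prune is a bitmask over the 16 2D TX types (as built by prune_tx_2D()), |
| // so bit tx_type is tested directly. For the older 1D modes, the low byte |
| // is indexed by the vertical 1D component (vtx_tab) and bits 8..15 by the |
| // horizontal 1D component (htx_tab); a 2D type is searched only if neither |
| // of its 1D components is pruned. |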
| |
| static void model_rd_from_sse(const AV1_COMP *const cpi, |
| const MACROBLOCKD *const xd, BLOCK_SIZE bsize, |
| int plane, int64_t sse, int *rate, |
| int64_t *dist) { |
| const struct macroblockd_plane *const pd = &xd->plane[plane]; |
| const int dequant_shift = |
| (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; |
| |
| // Fast approximation of the modeling function. |
| if (cpi->sf.simple_model_rd_from_var) { |
| const int64_t square_error = sse; |
| int quantizer = (pd->dequant_Q3[1] >> dequant_shift); |
| if (quantizer < 120) |
| *rate = (int)((square_error * (280 - quantizer)) >> |
| (16 - AV1_PROB_COST_SHIFT)); |
| else |
| *rate = 0; |
| *dist = (square_error * quantizer) >> 8; |
| } else { |
| av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], |
| pd->dequant_Q3[1] >> dequant_shift, rate, |
| dist); |
| } |
| *dist <<= 4; |
| } |
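| |
| // Numeric sketch of the simple model above, assuming AV1_PROB_COST_SHIFT is |
| // 9 (so the rate shift is 16 - 9 = 7): for an 8-bit block (dequant_shift = |
| // 3) with pd->dequant_Q3[1] = 256, the quantizer is 32; with sse = 1024 this |
| // gives rate = (1024 * (280 - 32)) >> 7 = 1984 and |
| // dist = ((1024 * 32) >> 8) << 4 = 2048. |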
| |
| #if CONFIG_COLLECT_INTER_MODE_RD_STATS |
| static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { |
| const AV1_COMMON *cm = &cpi->common; |
| const int num_planes = av1_num_planes(cm); |
| const MACROBLOCKD *xd = &x->e_mbd; |
| const MB_MODE_INFO *mbmi = xd->mi[0]; |
| int64_t total_sse = 0; |
| for (int plane = 0; plane < num_planes; ++plane) { |
| const struct macroblock_plane *const p = &x->plane[plane]; |
| const struct macroblockd_plane *const pd = &xd->plane[plane]; |
| const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, |
| pd->subsampling_y); |
| unsigned int sse; |
| |
| if (x->skip_chroma_rd && plane) continue; |
| |
| cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, |
| &sse); |
| total_sse += sse; |
| } |
| total_sse <<= 4; |
| return total_sse; |
| } |
| #endif |
| |
| static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, |
| MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, |
| int plane_to, int *out_rate_sum, |
| int64_t *out_dist_sum, int *skip_txfm_sb, |
| int64_t *skip_sse_sb, int *plane_rate, |
| int64_t *plane_sse, int64_t *plane_dist) { |
| // Note that our transform coefficients are 8 times those of an orthogonal |
| // transform, so the quantizer step is also 8 times larger. To get the |
| // effective quantizer we need to divide by 8 before calling the modeling |
| // function. |
| int plane; |
| const int ref = xd->mi[0]->ref_frame[0]; |
| |
| int64_t rate_sum = 0; |
| int64_t dist_sum = 0; |
| int64_t total_sse = 0; |
| |
| x->pred_sse[ref] = 0; |
| |
| for (plane = plane_from; plane <= plane_to; ++plane) { |
| struct macroblock_plane *const p = &x->plane[plane]; |
| struct macroblockd_plane *const pd = &xd->plane[plane]; |
| const BLOCK_SIZE bs = |
| get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); |
| unsigned int sse; |
| int rate; |
| int64_t dist; |
| |
| if (x->skip_chroma_rd && plane) continue; |
| |
| // TODO(geza): Write direct sse functions that do not compute |
| // variance as well. |
| cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, |
| &sse); |
| |
| if (plane == 0) x->pred_sse[ref] = sse; |
| |
| total_sse += sse; |
| |
| model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist); |
| |
| rate_sum += rate; |
| dist_sum += dist; |
| if (plane_rate) plane_rate[plane] = rate; |
| if (plane_sse) plane_sse[plane] = sse; |
| if (plane_dist) plane_dist[plane] = dist; |
| } |
| |
| if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; |
| if (skip_sse_sb) *skip_sse_sb = total_sse << 4; |
| *out_rate_sum = (int)rate_sum; |
| *out_dist_sum = dist_sum; |
| } |
| |
| static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize, |
| MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, |
| int plane_to, int *skip_txfm_sb) { |
| *skip_txfm_sb = 1; |
| for (int plane = plane_from; plane <= plane_to; ++plane) { |
| struct macroblock_plane *const p = &x->plane[plane]; |
| struct macroblockd_plane *const pd = &xd->plane[plane]; |
| const BLOCK_SIZE bs = |
| get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); |
| unsigned int sse; |
| |
| if (x->skip_chroma_rd && plane) continue; |
| |
| // Since the fast HBD variance functions scale down sse by 4 bits, we first |
| // use the fast vf implementation to rule out blocks with a non-zero scaled |
| // sse. Then, only if the source is HBD and the scaled sse is 0, an accurate |
| // sse computation is applied to determine whether the sse is really 0. This |
| // step is necessary for HBD lossless coding. |
| cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, |
| &sse); |
| if (sse) { |
| *skip_txfm_sb = 0; |
| return; |
| } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
| uint64_t sse64 = aom_highbd_sse_odd_size( |
| p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, |
| block_size_wide[bs], block_size_high[bs]); |
| |
| if (sse64) { |
| *skip_txfm_sb = 0; |
| return; |
| } |
| } |
| } |
| return; |
| } |
| |
| int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, |
| intptr_t block_size, int64_t *ssz) { |
| int i; |
| int64_t error = 0, sqcoeff = 0; |
| |
| for (i = 0; i < block_size; i++) { |
| const int diff = coeff[i] - dqcoeff[i]; |
| error += diff * diff; |
| sqcoeff += coeff[i] * coeff[i]; |
| } |
| |
| *ssz = sqcoeff; |
| return error; |
| } |
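| |
| // Example: for coeff = { 3, -2 } and dqcoeff = { 2, -2 } with block_size 2, |
| // the return value is (3 - 2)^2 + 0^2 = 1 and *ssz = 3^2 + (-2)^2 = 13. |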
| |
| int64_t av1_highbd_block_error_c(const tran_low_t *coeff, |
| const tran_low_t *dqcoeff, intptr_t block_size, |
| int64_t *ssz, int bd) { |
| int i; |
| int64_t error = 0, sqcoeff = 0; |
| int shift = 2 * (bd - 8); |
| int rounding = shift > 0 ? 1 << (shift - 1) : 0; |
| |
| for (i = 0; i < block_size; i++) { |
| const int64_t diff = coeff[i] - dqcoeff[i]; |
| error += diff * diff; |
| sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; |
| } |
| assert(error >= 0 && sqcoeff >= 0); |
| error = (error + rounding) >> shift; |
| sqcoeff = (sqcoeff + rounding) >> shift; |
| |
| *ssz = sqcoeff; |
| return error; |
| } |
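| |
| // The shift above rescales the high-bitdepth error back to 8-bit units: for |
| // bd = 10, shift = 4 and rounding = 8, so a raw error of 100 becomes |
| // (100 + 8) >> 4 = 6. |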
| |
| // Get transform block visible dimensions cropped to the MI units. |
| static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, |
| BLOCK_SIZE plane_bsize, int blk_row, int blk_col, |
| BLOCK_SIZE tx_bsize, int *width, int *height, |
| int *visible_width, int *visible_height) { |
| assert(tx_bsize <= plane_bsize); |
| int txb_height = block_size_high[tx_bsize]; |
| int txb_width = block_size_wide[tx_bsize]; |
| const int block_height = block_size_high[plane_bsize]; |
| const int block_width = block_size_wide[plane_bsize]; |
| const struct macroblockd_plane *const pd = &xd->plane[plane]; |
| // TODO(aconverse@google.com): Investigate using crop_width/height here rather |
| // than the MI size |
| const int block_rows = |
| (xd->mb_to_bottom_edge >= 0) |
| ? block_height |
| : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; |
| const int block_cols = |
| (xd->mb_to_right_edge >= 0) |
| ? block_width |
| : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; |
| const int tx_unit_size = tx_size_wide_log2[0]; |
| if (width) *width = txb_width; |
| if (height) *height = txb_height; |
| *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width); |
| *visible_height = |
| clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height); |
| } |
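| |
| // Cropping example, assuming xd->mb_to_right_edge is in 1/8-pel luma units: |
| // for a luma (subsampling_x = 0) block of width 16 that extends 8 pixels |
| // past the right frame edge, mb_to_right_edge = -64 and |
| // block_cols = (-64 >> 3) + 16 = 8, so a 4x4 transform unit at blk_col = 1 |
| // gets visible_width = clamp(8 - 4, 0, 4) = 4 while one at blk_col = 2 gets |
| // visible_width = 0. |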
| |
| // Compute the pixel domain distortion from src and dst on all visible 4x4s |
| // in the transform block. |
| static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, |
| int plane, const uint8_t *src, const int src_stride, |
| const uint8_t *dst, const int dst_stride, |
| int blk_row, int blk_col, |
| const BLOCK_SIZE plane_bsize, |
| const BLOCK_SIZE tx_bsize) { |
| int txb_rows, txb_cols, visible_rows, visible_cols; |
| const MACROBLOCKD *xd = &x->e_mbd; |
| |
| get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, |
| &txb_cols, &txb_rows, &visible_cols, &visible_rows); |
| assert(visible_rows > 0); |
| assert(visible_cols > 0); |
| |
| #if CONFIG_DIST_8X8 |
| if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8) |
| return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, |
| tx_bsize, txb_cols, txb_rows, visible_cols, |
| visible_rows, x->qindex); |
| #endif // CONFIG_DIST_8X8 |
| |
| unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst, |
| dst_stride, tx_bsize, txb_rows, |
| txb_cols, visible_rows, visible_cols); |
| |
| return sse; |
| } |
| |
| // Compute the pixel domain distortion from diff on all visible 4x4s in the |
| // transform block. |
| static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, |
| int blk_row, int blk_col, |
| const BLOCK_SIZE plane_bsize, |
| const BLOCK_SIZE tx_bsize) { |
| int visible_rows, visible_cols; |
| const MACROBLOCKD *xd = &x->e_mbd; |
| get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, |
| NULL, &visible_cols, &visible_rows); |
| const int diff_stride = block_size_wide[plane_bsize]; |
| const int16_t *diff = x->plane[plane].src_diff; |
| #if CONFIG_DIST_8X8 |
| int txb_height = block_size_high[tx_bsize]; |
| int txb_width = block_size_wide[tx_bsize]; |
| if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) { |
| const int src_stride = x->plane[plane].src.stride; |
| const int src_idx = (blk_row * src_stride + blk_col) |
| << tx_size_wide_log2[0]; |
| const uint8_t *src = &x->plane[plane].src.buf[src_idx]; |
| return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width, |
| txb_height, visible_cols, visible_rows, x->qindex); |
| } |
| #endif |
| diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]); |
| return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); |
| } |
| |
| int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, |
| int *val_count) { |
| const int max_pix_val = 1 << 8; |
| memset(val_count, 0, max_pix_val * sizeof(val_count[0])); |
| for (int r = 0; r < rows; ++r) { |
| for (int c = 0; c < cols; ++c) { |
| const int this_val = src[r * stride + c]; |
| assert(this_val < max_pix_val); |
| ++val_count[this_val]; |
| } |
| } |
| int n = 0; |
| for (int i = 0; i < max_pix_val; ++i) { |
| if (val_count[i]) ++n; |
| } |
| return n; |
| } |
| |
| int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, |
| int bit_depth, int *val_count) { |
| assert(bit_depth <= 12); |
| const int max_pix_val = 1 << bit_depth; |
| const uint16_t *src = CONVERT_TO_SHORTPTR(src8); |
| memset(val_count, 0, max_pix_val * sizeof(val_count[0])); |
| for (int r = 0; r < rows; ++r) { |
| for (int c = 0; c < cols; ++c) { |
| const int this_val = src[r * stride + c]; |
| assert(this_val < max_pix_val); |
| if (this_val >= max_pix_val) return 0; |
| ++val_count[this_val]; |
| } |
| } |
| int n = 0; |
| for (int i = 0; i < max_pix_val; ++i) { |
| if (val_count[i]) ++n; |
| } |
| return n; |
| } |
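| |
| // Usage note for the two color-counting helpers above: the caller's |
| // val_count buffer must hold at least 1 << 8 = 256 entries for |
| // av1_count_colors() and 1 << bit_depth entries (up to 4096 for 12-bit |
| // input) for av1_count_colors_highbd(), since both start by zeroing that |
| // many counters. |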
| |
| static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane, |
| int block, int blk_row, int blk_col, |
| int eob, int reduced_tx_set) { |
| struct macroblockd_plane *const pd = &xd->plane[plane]; |
| tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
| const PLANE_TYPE plane_type = get_plane_type(plane); |
| const TX_SIZE tx_size = av1_get_tx_size(plane, xd); |
| const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, |
| tx_size, reduced_tx_set); |
| const int dst_stride = pd->dst.stride; |
| uint8_t *dst = |
| &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; |
| av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, |
| dst_stride, eob, reduced_tx_set); |
| } |
| |
| static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash); |
| |
| static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row, |
| int blk_col, BLOCK_SIZE plane_bsize, |
| TX_SIZE tx_size) { |
| int16_t tmp_data[64 * 64]; |
| const int diff_stride = block_size_wide[plane_bsize]; |
| const int16_t *diff = x->plane[plane].src_diff; |
| const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col; |
| const int txb_w = tx_size_wide[tx_size]; |
| const int txb_h = tx_size_high[tx_size]; |
| uint8_t *hash_data = (uint8_t *)cur_diff_row; |
| if (txb_w != diff_stride) { |
| int16_t *cur_hash_row = tmp_data; |
| for (int i = 0; i < txb_h; i++) { |
| memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w); |
| cur_hash_row += txb_w; |
| cur_diff_row += diff_stride; |
| } |
| hash_data = (uint8_t *)tmp_data; |
| } |
| CRC32C *crc = &x->mb_rd_record.crc_calculator; |
| const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h); |
| return (hash << 5) + tx_size; |
| } |
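| |
| // The hash above is the CRC32C of the 2 * txb_w * txb_h residual bytes |
| // (copied into tmp_data when the transform width differs from the diff |
| // stride so that the hashed bytes are contiguous), with the transform size |
| // packed into the low 5 bits; assuming every TX_SIZE value fits in 5 bits, |
| // identical residuals evaluated at different transform sizes map to |
| // distinct keys. |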
| |
| static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, |
| TX_SIZE tx_size, int64_t *out_dist, |
| int64_t *out_sse) { |
| MACROBLOCKD *const xd = &x->e_mbd; |
| const struct macroblock_plane *const p = &x->plane[plane]; |
| const struct macroblockd_plane *const pd = &xd->plane[plane]; |
| // Transform domain distortion computation is more efficient as it does |
| // not involve an inverse transform, but it is less accurate. |
| const int buffer_length = av1_get_max_eob(tx_size); |
| int64_t this_sse; |
| // TX-domain results need to shift down to Q2/D10 to match pixel |
| // domain distortion values which are in Q2^2 |
| int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; |
| tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); |
| tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
| |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) |
| *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, |
| xd->bd); |
| else |
| *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); |
| |
| *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); |
| *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); |
| } |
| |
| static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, |
| int plane, BLOCK_SIZE plane_bsize, |
| int block, int blk_row, int blk_col, |
| TX_SIZE tx_size) { |
| MACROBLOCKD *const xd = &x->e_mbd; |
| const struct macroblock_plane *const p = &x->plane[plane]; |
| const struct macroblockd_plane *const pd = &xd->plane[plane]; |
| const uint16_t eob = p->eobs[block]; |
| const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; |
| const int bsw = block_size_wide[tx_bsize]; |
| const int bsh = block_size_high[tx_bsize]; |
| const int src_stride = x->plane[plane].src.stride; |
| const int dst_stride = xd->plane[plane].dst.stride; |
| // Scale the transform block index to pixel unit. |
| const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; |
| const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]; |
| const uint8_t *src = &x->plane[plane].src.buf[src_idx]; |
| const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; |
| const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); |
| |
| assert(cpi != NULL); |
| assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); |
| |
| uint8_t *recon; |
| DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); |
| |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
| recon = CONVERT_TO_BYTEPTR(recon16); |
| av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride, |
| CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, |
| bsh, NULL, NULL, 0, 0, NULL, xd->bd); |
| } else { |
| recon = (uint8_t *)recon16; |
| av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, |
| NULL, 0, 0, NULL); |
| } |
| |
| const PLANE_TYPE plane_type = get_plane_type(plane); |
| TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, |
| cpi->common.reduced_tx_set_used); |
| av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, |
| MAX_TX_SIZE, eob, |
| cpi->common.reduced_tx_set_used); |
| #if CONFIG_DIST_8X8 |
| if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { |
| // Save the decoded pixels of the inter block in x->pred_luma so that |
| // block_8x8_rd_txfm_daala_dist() does not need to produce them by calling |
| // av1_inverse_transform_block() again. |
| const int pred_stride = block_size_wide[plane_bsize]; |
| const int pred_idx = (blk_row * pred_stride + blk_col) |
| << tx_size_wide_log2[0]; |
| int16_t *pred = &x->pred_luma[pred_idx]; |
| int i, j; |
| |
| if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| pred[j * pred_stride + i] = |
| CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; |
| } else { |
| for (j = 0; j < bsh; j++) |
| for (i = 0; i < bsw; i++) |
| pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; |
| } |
| } |
| #endif // CONFIG_DIST_8X8 |
| return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, |
| blk_row, blk_col, plane_bsize, tx_bsize); |
| } |
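| |
| // The factor of 16 above puts the pixel-domain distortion on the same Q2^2 |
| // scale that dist_block_tx_domain() shifts its transform-domain result down |
| // to, matching the <<= 4 scaling used in model_rd_from_sse() and get_sse(). |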
| |
| static double get_mean(const int16_t *diff, int stride, int w, int h) { |
| double sum = 0.0; |
| for (int j = 0; j < h; ++j) { |
| for (int i = 0; i < w; ++i) { |
| sum += diff[j * stride + i]; |
| } |
| } |
| assert(w > 0 && h > 0); |
| return sum / (w * h); |
| } |
| |
| static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { |
| double sum = 0.0; |
| for (int j = 0; j < h; ++j) { |
| for (int i = 0; i < w; ++i) { |
| const int err = diff[j * stride + i]; |
| sum += err * err; |
| } |
| } |
| assert(w > 0 && h > 0); |
| return sum / (w * h); |
| } |
| |
| static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { |
| double sum = 0.0; |
| for (int j = 0; j < h; ++j) { |
| for (int i = 0; i < w; ++i) { |
| sum += abs(diff[j * stride + i]); |
| } |
| } |
| assert(w > 0 && h > 0); |
| return sum / (w * h); |
| } |
| |
| static void get_2x2_normalized_sses_and_sads( |
| const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, |
| int src_stride, const uint8_t *const dst, int dst_stride, |
| const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, |
| double *const sad_norm_arr) { |
| const BLOCK_SIZE tx_bsize_half = |
| get_partition_subsize(tx_bsize, PARTITION_SPLIT); |
| if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats |
| const int half_width = block_size_wide[tx_bsize] / 2; |
| const int half_height = block_size_high[tx_bsize] / 2; |
| for (int row = 0; row < 2; ++row) { |
| for (int col = 0; col < 2; ++col) { |
| const int16_t *const this_src_diff = |
| src_diff + row * half_height * diff_stride + col * half_width; |
| sse_norm_arr[row * 2 + col] = |
| get_sse_norm(this_src_diff, diff_stride, half_width, half_height); |
| sad_norm_arr[row * 2 + col] = |
| get_sad_norm(this_src_diff, diff_stride, half_width, half_height); |
| } |
| } |
| } else { // use function pointers to calculate stats |
| const int half_width = block_size_wide[tx_bsize_half]; |
| const int half_height = block_size_high[tx_bsize_half]; |
| const int num_samples_half = half_width * half_height; |
| for (int row = 0; row < 2; ++row) { |
| for (int col = 0; col < 2; ++col) { |
| const uint8_t *const this_src = |
| src + row * half_height * src_stride + col * half_width; |
| const uint8_t *const this_dst = |
| dst + row * half_height * dst_stride + col * half_width; |
| |
| unsigned int this_sse; |
| cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, |
| dst_stride, &this_sse); |
| sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; |
| |
| const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( |
| this_src, src_stride, this_dst, dst_stride); |
| sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; |
| } |
| } |
| } |
| } |
| |
| #if CONFIG_COLLECT_RD_STATS |
| // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values |
| // 0: Do not collect any RD stats |
| // 1: Collect RD stats for transform units |
| // 2: Collect RD stats for partition units |
| static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, |
| const RD_STATS *const rd_stats, int blk_row, |
| int blk_col, BLOCK_SIZE plane_bsize, |
| TX_SIZE tx_size, TX_TYPE tx_type) { |
| if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; |
| |
| // Sample only about 1% of blocks to restrict output size. |
| static unsigned int seed = 21743; |
| if (lcg_rand16(&seed) % 100 > 0) return; |
|