/*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*
*/
#include <math.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"
#include "aom_mem/aom_mem.h"
#include "av1/common/av1_common_int.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#if CONFIG_PC_WIENER
#include "av1/common/pc_wiener_filters.h"
#endif // CONFIG_PC_WIENER
#if CONFIG_WIENER_NONSEP
#define AOM_WIENERNS_COEFF(p, b, m, k) \
{ (b) + (p)-6, (m) * (1 << ((p)-6)), k }
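// Each coefficient entry expands to { (b) + (p) - 6, (m) * (1 << ((p) - 6)), k }:
// b and m look like a coding bit count and a minimum value given at a 6-bit
// reference precision and rescaled to p bits of filter precision, with k a
// per-coefficient id. For example, AOM_WIENERNS_COEFF(7, 5, -12, 0) expands to
// { 6, -24, 0 }.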
#define AOM_MAKE_WIENERNS_CONFIG(prec, config, coeff) \
{ \
{ (prec), sizeof(config) / sizeof(config[0]), 0, (config), NULL, 0, 1 }, \
sizeof(coeff) / sizeof(coeff[0]), (coeff) \
}
#define AOM_MAKE_WIENERNS_CONFIG2(prec, config, config2, coeff) \
{ \
{ (prec), \
sizeof(config) / sizeof(config[0]), \
sizeof(config2) / sizeof(config2[0]), \
(config), \
(config2), \
0, \
1 }, \
sizeof(coeff) / sizeof(coeff[0]), (coeff) \
}
///////////////////////////////////////////////////////////////////////////
// First filter configuration
///////////////////////////////////////////////////////////////////////////
const int wienerns_config_y[][3] = {
{ 1, 0, 0 }, { -1, 0, 0 }, { 0, 1, 1 }, { 0, -1, 1 }, { 2, 0, 2 },
{ -2, 0, 2 }, { 0, 2, 3 }, { 0, -2, 3 }, { 1, 1, 4 }, { -1, -1, 4 },
{ -1, 1, 5 }, { 1, -1, 5 }, { 2, 1, 6 }, { -2, -1, 6 }, { 2, -1, 7 },
{ -2, 1, 7 }, { 1, 2, 8 }, { -1, -2, 8 }, { 1, -2, 9 }, { -1, 2, 9 },
{ 3, 0, 10 }, { -3, 0, 10 }, { 0, 3, 11 }, { 0, -3, 11 },
#if USE_CENTER_WIENER_NONSEP
{ 0, 0, 12 },
#endif // USE_CENTER_WIENER_NONSEP
};
const int wienerns_config_uv_from_uv[][3] = {
{ 1, 0, 0 }, { -1, 0, 0 }, { 0, 1, 1 }, { 0, -1, 1 },
{ 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
{ 2, 0, 4 }, { -2, 0, 4 }, { 0, 2, 5 }, { 0, -2, 5 },
};
const int wienerns_config_uv_from_y[][3] = {
#if CONFIG_WIENER_NONSEP_CROSS_FILT
{ 1, 0, 6 }, { -1, 0, 6 }, { 0, 1, 7 }, { 0, -1, 7 },
{ 1, 1, 8 }, { -1, -1, 8 }, { -1, 1, 9 }, { 1, -1, 9 },
{ 2, 0, 10 }, { -2, 0, 10 }, { 0, 2, 11 }, { 0, -2, 11 },
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
};
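// Each config entry above is { row offset, col offset, coefficient index }.
// Taps come in symmetric pairs around the center (e.g. { 1, 0 } and { -1, 0 })
// that share a coefficient index, so the 24-tap luma footprint needs only 12
// signaled coefficients (plus the optional center tap). The *_uv_from_y
// indices continue after the *_uv_from_uv ones, since both feed a single
// chroma coefficient vector.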
const int wienerns_prec_bits_y = 7;
const int wienerns_coeff_y[][WIENERNS_COEFCFG_LEN] = {
#if CONFIG_LR_4PART_CODE
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
#if USE_CENTER_WIENER_NONSEP
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 5, -16, 0),
#endif // USE_CENTER_WIENER_NONSEP
#else
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 3, -4, 2),
#if USE_CENTER_WIENER_NONSEP
AOM_WIENERNS_COEFF(wienerns_prec_bits_y, 5, -16, 3),
#endif // USE_CENTER_WIENER_NONSEP
#endif // CONFIG_LR_4PART_CODE
};
const int wienerns_prec_bits_uv = 7;
const int wienerns_coeff_uv[][WIENERNS_COEFCFG_LEN] = {
#if CONFIG_LR_4PART_CODE
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 1),
#if CONFIG_WIENER_NONSEP_CROSS_FILT
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
#else
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 3),
#if CONFIG_WIENER_NONSEP_CROSS_FILT
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv, 3, -4, 2),
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
#endif // CONFIG_LR_4PART_CODE
};
const WienernsFilterConfigType wienerns_filter_y = AOM_MAKE_WIENERNS_CONFIG(
wienerns_prec_bits_y, wienerns_config_y, wienerns_coeff_y);
const WienernsFilterConfigType wienerns_filter_uv =
AOM_MAKE_WIENERNS_CONFIG2(wienerns_prec_bits_uv, wienerns_config_uv_from_uv,
wienerns_config_uv_from_y, wienerns_coeff_uv);
const WienernsFilterConfigPairType wienerns_filters_midqp = {
&wienerns_filter_y, &wienerns_filter_uv
};
///////////////////////////////////////////////////////////////////////////
// Second filter configuration
///////////////////////////////////////////////////////////////////////////
const int wienerns_config_y2[][3] = {
{ 1, 0, 0 }, { -1, 0, 0 }, { 0, 1, 1 }, { 0, -1, 1 }, { 2, 0, 2 },
{ -2, 0, 2 }, { 0, 2, 3 }, { 0, -2, 3 }, { 1, 1, 4 }, { -1, -1, 4 },
{ -1, 1, 5 }, { 1, -1, 5 }, { 2, 1, 6 }, { -2, -1, 6 }, { 2, -1, 7 },
{ -2, 1, 7 }, { 1, 2, 8 }, { -1, -2, 8 }, { 1, -2, 9 }, { -1, 2, 9 },
#if USE_CENTER_WIENER_NONSEP
{ 0, 0, 10 },
#endif // USE_CENTER_WIENER_NONSEP
};
const int wienerns_config_uv_from_uv2[][3] = {
{ 1, 0, 0 }, { -1, 0, 0 }, { 0, 1, 1 }, { 0, -1, 1 },
{ 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
{ 2, 0, 4 }, { -2, 0, 4 }, { 0, 2, 5 }, { 0, -2, 5 },
};
const int wienerns_config_uv_from_y2[][3] = {
#if CONFIG_WIENER_NONSEP_CROSS_FILT
{ 1, 0, 6 }, { -1, 0, 6 }, { 0, 1, 7 }, { 0, -1, 7 },
{ 1, 1, 8 }, { -1, -1, 8 }, { -1, 1, 9 }, { 1, -1, 9 },
{ 2, 0, 10 }, { -2, 0, 10 }, { 0, 2, 11 }, { 0, -2, 11 },
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
};
const int wienerns_prec_bits_y2 = 7;
const int wienerns_coeff_y2[][WIENERNS_COEFCFG_LEN] = {
#if CONFIG_LR_4PART_CODE
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
#if USE_CENTER_WIENER_NONSEP
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 5, -16, 0),
#endif // USE_CENTER_WIENER_NONSEP
#else
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 3, -4, 2),
#if USE_CENTER_WIENER_NONSEP
AOM_WIENERNS_COEFF(wienerns_prec_bits_y2, 5, -16, 3),
#endif // USE_CENTER_WIENER_NONSEP
#endif // CONFIG_LR_4PART_CODE
};
const int wienerns_prec_bits_uv2 = 7;
const int wienerns_coeff_uv2[][WIENERNS_COEFCFG_LEN] = {
#if CONFIG_LR_4PART_CODE
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 1),
#if CONFIG_WIENER_NONSEP_CROSS_FILT
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
#else
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 3),
#if CONFIG_WIENER_NONSEP_CROSS_FILT
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv2, 3, -4, 2),
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
#endif // CONFIG_LR_4PART_CODE
};
const WienernsFilterConfigType wienerns_filter_y2 = AOM_MAKE_WIENERNS_CONFIG(
wienerns_prec_bits_y2, wienerns_config_y2, wienerns_coeff_y2);
const WienernsFilterConfigType wienerns_filter_uv2 = AOM_MAKE_WIENERNS_CONFIG2(
wienerns_prec_bits_uv2, wienerns_config_uv_from_uv2,
wienerns_config_uv_from_y2, wienerns_coeff_uv2);
const WienernsFilterConfigPairType wienerns_filters_highqp = {
&wienerns_filter_y2, &wienerns_filter_uv2
};
///////////////////////////////////////////////////////////////////////////
// Third filter configuration
///////////////////////////////////////////////////////////////////////////
const int wienerns_config_y3[][3] = {
{ 1, 0, 0 }, { -1, 0, 0 }, { 0, 1, 1 }, { 0, -1, 1 }, { 2, 0, 2 },
{ -2, 0, 2 }, { 0, 2, 3 }, { 0, -2, 3 }, { 1, 1, 4 }, { -1, -1, 4 },
{ -1, 1, 5 }, { 1, -1, 5 }, { 2, 1, 6 }, { -2, -1, 6 }, { 2, -1, 7 },
{ -2, 1, 7 }, { 1, 2, 8 }, { -1, -2, 8 }, { 1, -2, 9 }, { -1, 2, 9 },
{ 3, 0, 10 }, { -3, 0, 10 }, { 0, 3, 11 }, { 0, -3, 11 }, { 2, 2, 12 },
{ -2, -2, 12 }, { -2, 2, 13 }, { 2, -2, 13 },
#if USE_CENTER_WIENER_NONSEP
{ 0, 0, 14 },
#endif // USE_CENTER_WIENER_NONSEP
};
const int wienerns_config_uv_from_uv3[][3] = {
{ 1, 0, 0 }, { -1, 0, 0 }, { 0, 1, 1 }, { 0, -1, 1 },
{ 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
{ 2, 0, 4 }, { -2, 0, 4 }, { 0, 2, 5 }, { 0, -2, 5 },
};
const int wienerns_config_uv_from_y3[][3] = {
#if CONFIG_WIENER_NONSEP_CROSS_FILT
{ 1, 0, 6 }, { -1, 0, 6 }, { 0, 1, 7 }, { 0, -1, 7 },
{ 1, 1, 8 }, { -1, -1, 8 }, { -1, 1, 9 }, { 1, -1, 9 },
{ 2, 0, 10 }, { -2, 0, 10 }, { 0, 2, 11 }, { 0, -2, 11 },
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
};
const int wienerns_prec_bits_y3 = 7;
const int wienerns_coeff_y3[][WIENERNS_COEFCFG_LEN] = {
#if CONFIG_LR_4PART_CODE
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
#if USE_CENTER_WIENER_NONSEP
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 5, -16, 0),
#endif // USE_CENTER_WIENER_NONSEP
#else
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 3, -4, 2),
#if USE_CENTER_WIENER_NONSEP
AOM_WIENERNS_COEFF(wienerns_prec_bits_y3, 5, -16, 3),
#endif // USE_CENTER_WIENER_NONSEP
#endif // CONFIG_LR_4PART_CODE
};
const int wienerns_prec_bits_uv3 = 7;
const int wienerns_coeff_uv3[][WIENERNS_COEFCFG_LEN] = {
#if CONFIG_LR_4PART_CODE
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 5, -12, 0),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -7, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 1),
#if CONFIG_WIENER_NONSEP_CROSS_FILT
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 1),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
#else
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 5, -12, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -7, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 3),
#if CONFIG_WIENER_NONSEP_CROSS_FILT
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 4, -8, 3),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
AOM_WIENERNS_COEFF(wienerns_prec_bits_uv3, 3, -4, 2),
#endif // CONFIG_WIENER_NONSEP_CROSS_FILT
#endif // CONFIG_LR_4PART_CODE
};
const WienernsFilterConfigType wienerns_filter_y3 = AOM_MAKE_WIENERNS_CONFIG(
wienerns_prec_bits_y3, wienerns_config_y3, wienerns_coeff_y3);
const WienernsFilterConfigType wienerns_filter_uv3 = AOM_MAKE_WIENERNS_CONFIG2(
wienerns_prec_bits_uv3, wienerns_config_uv_from_uv3,
wienerns_config_uv_from_y3, wienerns_coeff_uv3);
const WienernsFilterConfigPairType wienerns_filters_lowqp = {
&wienerns_filter_y3, &wienerns_filter_uv3
};
#endif // CONFIG_WIENER_NONSEP
// The 's' values are calculated based on original 'r' and 'e' values in the
// spec using GenSgrprojVtable().
// Note: Setting r = 0 skips the filter; the corresponding s is set to -1 (invalid).
const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
{ { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
{ { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
{ { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
{ { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
{ { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
{ { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
{ { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
{ { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
};
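// For reference, GenSgrprojVtable() below (under #if 0) derives each s as
// round(2^SGRPROJ_MTABLE_BITS / (n^2 * e)) with n = (2r + 1)^2. Assuming
// SGRPROJ_MTABLE_BITS == 20, the first entry's s[0] = 140 is consistent with
// r = 2 (n = 25) and e = 12: round(2^20 / (625 * 12)) = 140.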
AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
AV1PixelRect rect;
int ss_x = is_uv && cm->seq_params.subsampling_x;
int ss_y = is_uv && cm->seq_params.subsampling_y;
rect.top = 0;
rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
rect.left = 0;
rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
return rect;
}
// Count horizontal or vertical units per tile (use a width or height for
// tile_size, respectively). We basically want to divide the tile size by the
// size of a restoration unit. Rather than rounding up unconditionally as you
// might expect, we round to nearest, which models the way a right or bottom
// restoration unit can extend to up to 150% its normal width or height. The
// max with 1 is to deal with tiles that are smaller than half of a restoration
// unit.
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
}
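// For example, unit_size = 256 and tile_size = 600 gives (600 + 128) / 256 = 2
// units, so the last unit stretches to 600 - 256 = 344 pixels (within the 150%
// limit of 384) instead of adding a third, tiny unit.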
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
int is_uv) {
// We need to allocate enough space for restoration units to cover the
// largest tile, which here is simply the whole (superres-upscaled) frame as
// returned by av1_whole_frame_rect().
const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
const int max_tile_w = tile_rect.right - tile_rect.left;
const int max_tile_h = tile_rect.bottom - tile_rect.top;
// To calculate hpertile and vpertile (horizontal and vertical units per
// tile), divide the largest tile width or height by the size of a
// restoration unit, rounding as described above av1_lr_count_units_in_tile().
const int unit_size = rsi->restoration_unit_size;
const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
rsi->units_per_tile = hpertile * vpertile;
rsi->horz_units_per_tile = hpertile;
rsi->vert_units_per_tile = vpertile;
const int ntiles = 1;
const int nunits = ntiles * rsi->units_per_tile;
aom_free(rsi->unit_info);
CHECK_MEM_ERROR(cm, rsi->unit_info,
(RestorationUnitInfo *)aom_memalign(
16, sizeof(*rsi->unit_info) * nunits));
}
void av1_free_restoration_struct(RestorationInfo *rst_info) {
aom_free(rst_info->unit_info);
rst_info->unit_info = NULL;
}
#if CONFIG_CNN_GUIDED_QUADTREE
void av1_alloc_quadtree_struct(struct AV1Common *cm, QUADInfo *quad_info) {
int split_size;
int A_size;
const int quadtree_unit_size = 512 >> cm->use_quad_level;
// cm->cur_quad_info->unit_size = quadtree_unit_size;
// cm->postcnn_quad_info->unit_size = quadtree_unit_size;
// int quadtree_unit_size = cm->cur_quad_info->unit_size;
YV12_BUFFER_CONFIG *pcPicYuvRec = &cm->cur_frame->buf;
int height = pcPicYuvRec->y_height;
int width = pcPicYuvRec->y_width;
int regular_height_num = (int)floor(((float)height) / quadtree_unit_size);
int regular_width_num = (int)floor(((float)width) / quadtree_unit_size);
int all_num = (int)ceil(((float)width) / quadtree_unit_size) *
(int)ceil(((float)height) / quadtree_unit_size);
int regularblock_num = regular_height_num * regular_width_num;
int un_regularblock_num = all_num - regularblock_num;
split_size = regularblock_num * 2;  // each split decision needs two bits
CHECK_MEM_ERROR(cm, quad_info->split_info,
(QUADSplitInfo *)aom_memalign(
16, sizeof(*quad_info->split_info) * split_size));
A_size = regularblock_num * 4 + un_regularblock_num;
CHECK_MEM_ERROR(
cm, quad_info->unit_info,
(QUADUnitInfo *)aom_memalign(16, sizeof(*quad_info->unit_info) * A_size));
}
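// For example, a 1920x1080 luma plane with quadtree_unit_size = 512 gives
// regular_width_num = 3, regular_height_num = 2, all_num = 4 * 3 = 12, hence
// regularblock_num = 6, un_regularblock_num = 6, split_size = 12 and
// A_size = 6 * 4 + 6 = 30.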
void av1_free_quadtree_struct(QUADInfo *quad_info) {
if (quad_info->unit_info != NULL) {
aom_free(quad_info->unit_info);
quad_info->unit_info = NULL;
}
if (quad_info->split_info != NULL) {
aom_free(quad_info->split_info);
quad_info->split_info = NULL;
}
quad_info->unit_size = 0;
quad_info->is_write = 0;
quad_info->split_info_length = 0;
quad_info->unit_info_length = 0;
quad_info->split_info_index = 0;
quad_info->unit_info_index = 0;
}
#endif // CONFIG_CNN_GUIDED_QUADTREE
#if 0
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];
static void GenSgrprojVtable() {
for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
const sgr_params_type *const params = &av1_sgr_params[i];
for (int j = 0; j < 2; ++j) {
const int e = params->e[j];
const int r = params->r[j];
if (r == 0) { // filter is disabled
sgrproj_mtable[i][j] = -1; // mark invalid
} else { // filter is enabled
const int n = (2 * r + 1) * (2 * r + 1);
const int n2e = n * n * e;
assert(n2e != 0);
sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
}
}
}
}
#endif
void av1_loop_restoration_precal() {
#if 0
GenSgrprojVtable();
#endif
}
static void extend_frame_highbd(uint16_t *data, int width, int height,
int stride, int border_horz, int border_vert) {
uint16_t *data_p;
int i, j;
for (i = 0; i < height; ++i) {
data_p = data + i * stride;
for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
}
data_p = data - border_horz;
for (i = -border_vert; i < 0; ++i) {
memcpy(data_p + i * stride, data_p,
(width + 2 * border_horz) * sizeof(uint16_t));
}
for (i = height; i < height + border_vert; ++i) {
memcpy(data_p + i * stride, data_p + (height - 1) * stride,
(width + 2 * border_horz) * sizeof(uint16_t));
}
}
static void copy_tile_highbd(int width, int height, const uint16_t *src,
int src_stride, uint16_t *dst, int dst_stride) {
for (int i = 0; i < height; ++i)
memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
}
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
int border_horz, int border_vert) {
extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
border_horz, border_vert);
}
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride) {
copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
CONVERT_TO_SHORTPTR(dst), dst_stride);
}
#define REAL_PTR(d) ((uint8_t *)CONVERT_TO_SHORTPTR(d))
// With striped loop restoration, the filtering for each 64-pixel stripe gets
// most of its input from the output of CDEF (stored in data8), but we need to
// fill out a border of 3 pixels above/below the stripe according to the
// following rules:
//
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
// This extension is done by a call to av1_extend_frame() at the start of the
// loop restoration process, so the value of copy_above/copy_below doesn't
// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever
// loop filtering across tiles is disabled, we can allow
// {setup,restore}_processing_stripe_boundary to assume that the top/bottom
// data has always been copied, simplifying the behaviour at the left and
// right edges of tiles.
//
// * If we're at a tile boundary and loop filtering across tiles is enabled,
// then there is a logical stripe which is 64 pixels high, but which is split
// into an 8px high and a 56px high stripe so that the processing (and
// coefficient set usage) can be aligned to tiles.
// In this case, we use the 3 rows of CDEF output across the boundary for
// context; this corresponds to leaving the frame buffer as-is.
//
// * If we're at a tile boundary and loop filtering across tiles is disabled,
// then we take the outermost row of CDEF pixels *within the current tile*
// and copy it three times. Thus we behave exactly as if the tile were a full
// frame.
//
// * Otherwise, we're at a stripe boundary within a tile. In that case, we
// take 2 rows of deblocked pixels and extend them to 3 rows of context.
//
// The distinction between the latter two cases is handled by the
// av1_loop_restoration_save_boundary_lines() function, so here we just need
// to decide if we're overwriting the above/below boundary pixels or not.
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
const AV1PixelRect *tile_rect, int ss_y,
int *copy_above, int *copy_below) {
*copy_above = 1;
*copy_below = 1;
const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
const int this_stripe_height =
full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
const int last_stripe_in_tile =
(limits->v_start + this_stripe_height >= tile_rect->bottom);
if (first_stripe_in_tile) *copy_above = 0;
if (last_stripe_in_tile) *copy_below = 0;
}
// Overwrite the border pixels around a processing stripe so that the
// conditions listed in the comment above get_stripe_boundary_info() are
// preserved.
// We save the pixels which get overwritten into a temporary buffer, so that
// they can be restored by restore_processing_stripe_boundary() after we've
// processed the stripe.
//
// limits gives the rectangular limits of the remaining stripes for the current
// restoration unit. rsb is the stored stripe boundaries (taken from either
// deblock or CDEF output as necessary).
//
// rsb_row is the row offset into rsb's stripe boundary buffers for the current
// stripe, and h is the height of the stripe being processed.
static void setup_processing_stripe_boundary(
const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
int rsb_row, int h, uint8_t *data8, int data_stride,
RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
// Offsets within the line buffers. The buffer logically starts at column
// -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
// has column x0 in the buffer.
const int buf_stride = rsb->stripe_boundary_stride;
const int buf_x0_off = limits->h_start;
const int line_width =
(limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
const int line_size = line_width << 1;
const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
// Replace RESTORATION_BORDER pixels above the top of the stripe
// We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
// to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
// duplicating the topmost of the 2 lines (see the AOMMAX call when
// calculating buf_row, whose offset from rsb_row is 0, 0, 1 for i = -3, -2, -1).
//
// Special case: If we're at the top of a tile, which isn't on the topmost
// tile row, and we're allowed to loop filter across tiles, then we have a
// logical 64-pixel-high stripe which has been split into an 8-pixel high
// stripe and a 56-pixel high stripe (the current one). So, in this case,
// we want to leave the boundary alone!
if (!opt) {
if (copy_above) {
uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
for (int i = -RESTORATION_BORDER; i < 0; ++i) {
const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
const int buf_off = buf_x0_off + buf_row * buf_stride;
const uint8_t *buf = rsb->stripe_boundary_above + (buf_off << 1);
uint8_t *dst8 = data8_tl + i * data_stride;
// Save old pixels, then replace with data from stripe_boundary_above
memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], REAL_PTR(dst8),
line_size);
memcpy(REAL_PTR(dst8), buf, line_size);
}
}
// Replace RESTORATION_BORDER pixels below the bottom of the stripe.
// The second buffer row is repeated: the AOMMIN call makes buf_row's offset
// from rsb_row take the values 0, 1, 1 for i = 0, 1, 2.
if (copy_below) {
const int stripe_end = limits->v_start + h;
uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
for (int i = 0; i < RESTORATION_BORDER; ++i) {
const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
const int buf_off = buf_x0_off + buf_row * buf_stride;
const uint8_t *src = rsb->stripe_boundary_below + (buf_off << 1);
uint8_t *dst8 = data8_bl + i * data_stride;
// Save old pixels, then replace with data from stripe_boundary_below
memcpy(rlbs->tmp_save_below[i], REAL_PTR(dst8), line_size);
memcpy(REAL_PTR(dst8), src, line_size);
}
}
} else {
if (copy_above) {
uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
// Only save and overwrite i=-RESTORATION_BORDER line.
uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
// Save old pixels, then replace with data from stripe_boundary_above
memcpy(rlbs->tmp_save_above[0], REAL_PTR(dst8), line_size);
memcpy(REAL_PTR(dst8),
REAL_PTR(data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
line_size);
}
if (copy_below) {
const int stripe_end = limits->v_start + h;
uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
// Only save and overwrite i=2 line.
uint8_t *dst8 = data8_bl + 2 * data_stride;
// Save old pixels, then replace with data from stripe_boundary_below
memcpy(rlbs->tmp_save_below[2], REAL_PTR(dst8), line_size);
memcpy(REAL_PTR(dst8), REAL_PTR(data8_bl + (2 - 1) * data_stride),
line_size);
}
}
}
// This function restores the boundary lines modified by
// setup_processing_stripe_boundary.
//
// Note: We need to be careful when handling the corners of the processing
// unit, because (eg.) the top-left corner is considered to be part of
// both the left and top borders. This means that, depending on the
// loop_filter_across_tiles_enabled flag, the corner pixels might get
// overwritten twice, once as part of the "top" border and once as part
// of the "left" border (or similar for other corners).
//
// Everything works out fine as long as we make sure to reverse the order
// when restoring, ie. we need to restore the left/right borders followed
// by the top/bottom borders.
static void restore_processing_stripe_boundary(
const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
int h, uint8_t *data8, int data_stride, int copy_above, int copy_below,
int opt) {
const int line_width =
(limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
const int line_size = line_width << 1;
const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
if (!opt) {
if (copy_above) {
uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
for (int i = -RESTORATION_BORDER; i < 0; ++i) {
uint8_t *dst8 = data8_tl + i * data_stride;
memcpy(REAL_PTR(dst8), rlbs->tmp_save_above[i + RESTORATION_BORDER],
line_size);
}
}
if (copy_below) {
const int stripe_bottom = limits->v_start + h;
uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
for (int i = 0; i < RESTORATION_BORDER; ++i) {
if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
uint8_t *dst8 = data8_bl + i * data_stride;
memcpy(REAL_PTR(dst8), rlbs->tmp_save_below[i], line_size);
}
}
} else {
if (copy_above) {
uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
// Only restore i=-RESTORATION_BORDER line.
uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
memcpy(REAL_PTR(dst8), rlbs->tmp_save_above[0], line_size);
}
if (copy_below) {
const int stripe_bottom = limits->v_start + h;
uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
// Only restore i=2 line.
if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
uint8_t *dst8 = data8_bl + 2 * data_stride;
memcpy(REAL_PTR(dst8), rlbs->tmp_save_below[2], line_size);
}
}
}
}
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
over the input. The window is of size (2r + 1)x(2r + 1), and we
specialize to r = 1 and r = 2; other radii are rejected by the assert in boxsum().
Each loop follows the same format: We keep a window's worth of input
in individual variables and select data out of that as appropriate.
*/
static void boxsum1(int32_t *src, int width, int height, int src_stride,
int sqr, int32_t *dst, int dst_stride) {
int i, j, a, b, c;
assert(width > 2 * SGRPROJ_BORDER_HORZ);
assert(height > 2 * SGRPROJ_BORDER_VERT);
// Vertical sum over 3-pixel regions, from src into dst.
if (!sqr) {
for (j = 0; j < width; ++j) {
a = src[j];
b = src[src_stride + j];
c = src[2 * src_stride + j];
dst[j] = a + b;
for (i = 1; i < height - 2; ++i) {
// Loop invariant: At the start of each iteration,
// a = src[(i - 1) * src_stride + j]
// b = src[(i ) * src_stride + j]
// c = src[(i + 1) * src_stride + j]
dst[i * dst_stride + j] = a + b + c;
a = b;
b = c;
c = src[(i + 2) * src_stride + j];
}
dst[i * dst_stride + j] = a + b + c;
dst[(i + 1) * dst_stride + j] = b + c;
}
} else {
for (j = 0; j < width; ++j) {
a = src[j] * src[j];
b = src[src_stride + j] * src[src_stride + j];
c = src[2 * src_stride + j] * src[2 * src_stride + j];
dst[j] = a + b;
for (i = 1; i < height - 2; ++i) {
dst[i * dst_stride + j] = a + b + c;
a = b;
b = c;
c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
}
dst[i * dst_stride + j] = a + b + c;
dst[(i + 1) * dst_stride + j] = b + c;
}
}
// Horizontal sum over 3-pixel regions of dst
for (i = 0; i < height; ++i) {
a = dst[i * dst_stride];
b = dst[i * dst_stride + 1];
c = dst[i * dst_stride + 2];
dst[i * dst_stride] = a + b;
for (j = 1; j < width - 2; ++j) {
// Loop invariant: At the start of each iteration, a, b, c hold the
// vertical column sums for columns (j - 1), j, (j + 1), i.e. the values
// that dst held at those columns before this horizontal pass.
dst[i * dst_stride + j] = a + b + c;
a = b;
b = c;
c = dst[i * dst_stride + (j + 2)];
}
dst[i * dst_stride + j] = a + b + c;
dst[i * dst_stride + (j + 1)] = b + c;
}
}
static void boxsum2(int32_t *src, int width, int height, int src_stride,
int sqr, int32_t *dst, int dst_stride) {
int i, j, a, b, c, d, e;
assert(width > 2 * SGRPROJ_BORDER_HORZ);
assert(height > 2 * SGRPROJ_BORDER_VERT);
// Vertical sum over 5-pixel regions, from src into dst.
if (!sqr) {
for (j = 0; j < width; ++j) {
a = src[j];
b = src[src_stride + j];
c = src[2 * src_stride + j];
d = src[3 * src_stride + j];
e = src[4 * src_stride + j];
dst[j] = a + b + c;
dst[dst_stride + j] = a + b + c + d;
for (i = 2; i < height - 3; ++i) {
// Loop invariant: At the start of each iteration,
// a = src[(i - 2) * src_stride + j]
// b = src[(i - 1) * src_stride + j]
// c = src[(i ) * src_stride + j]
// d = src[(i + 1) * src_stride + j]
// e = src[(i + 2) * src_stride + j]
dst[i * dst_stride + j] = a + b + c + d + e;
a = b;
b = c;
c = d;
d = e;
e = src[(i + 3) * src_stride + j];
}
dst[i * dst_stride + j] = a + b + c + d + e;
dst[(i + 1) * dst_stride + j] = b + c + d + e;
dst[(i + 2) * dst_stride + j] = c + d + e;
}
} else {
for (j = 0; j < width; ++j) {
a = src[j] * src[j];
b = src[src_stride + j] * src[src_stride + j];
c = src[2 * src_stride + j] * src[2 * src_stride + j];
d = src[3 * src_stride + j] * src[3 * src_stride + j];
e = src[4 * src_stride + j] * src[4 * src_stride + j];
dst[j] = a + b + c;
dst[dst_stride + j] = a + b + c + d;
for (i = 2; i < height - 3; ++i) {
dst[i * dst_stride + j] = a + b + c + d + e;
a = b;
b = c;
c = d;
d = e;
e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
}
dst[i * dst_stride + j] = a + b + c + d + e;
dst[(i + 1) * dst_stride + j] = b + c + d + e;
dst[(i + 2) * dst_stride + j] = c + d + e;
}
}
// Horizontal sum over 5-pixel regions of dst
for (i = 0; i < height; ++i) {
a = dst[i * dst_stride];
b = dst[i * dst_stride + 1];
c = dst[i * dst_stride + 2];
d = dst[i * dst_stride + 3];
e = dst[i * dst_stride + 4];
dst[i * dst_stride] = a + b + c;
dst[i * dst_stride + 1] = a + b + c + d;
for (j = 2; j < width - 3; ++j) {
// Loop invariant: At the start of each iteration, a, b, c, d, e hold the
// vertical column sums for columns (j - 2) ... (j + 2), i.e. the values
// that dst held at those columns before this horizontal pass.
dst[i * dst_stride + j] = a + b + c + d + e;
a = b;
b = c;
c = d;
d = e;
e = dst[i * dst_stride + (j + 3)];
}
dst[i * dst_stride + j] = a + b + c + d + e;
dst[i * dst_stride + (j + 1)] = b + c + d + e;
dst[i * dst_stride + (j + 2)] = c + d + e;
}
}
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
int sqr, int32_t *dst, int dst_stride) {
if (r == 1)
boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
else if (r == 2)
boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
else
assert(0 && "Invalid value of r in self-guided filter");
}
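// Rough sanity check: for constant input x with r = 1, interior outputs are
// 9 * x (9 * x^2 when sqr = 1), while the outermost rows and columns use the
// truncated partial sums formed at the loop edges above.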
void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
if (params->r[0] == 0) {
xq[0] = 0;
xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
} else if (params->r[1] == 0) {
xq[0] = xqd[0];
xq[1] = 0;
} else {
xq[0] = xqd[0];
xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
}
}
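// Note: av1_apply_selfguided_restoration_c() forms
// (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u), so the
// implicit weight on the source pixel is (1 << SGRPROJ_PRJ_BITS) - xq[0] -
// xq[1] and the three weights always sum to unity at SGRPROJ_PRJ_BITS
// precision; when both radii are nonzero that source weight is exactly xqd[1].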
const int32_t av1_x_by_xplus1[256] = {
// Special case: Map 0 -> 1 (corresponding to a value of 1/256)
// instead of 0. See the comments on A[k] in calculate_intermediate_result().
1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
256,
};
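// Entries approximate 256 * x / (x + 1), e.g. index 3 gives 192 = 256 * 3 / 4,
// with the endpoints pinned to 1 (index 0) and 256 (index 255) so that A[k]
// below stays in [1, 256].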
const int32_t av1_one_by_x[MAX_NELEM] = {
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
};
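// Here av1_one_by_x[n - 1] = round(2^12 / n); e.g. n = 25 (r = 2) gives
// round(4096 / 25) = 164, the last entry above.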
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
int dgd_stride, int bit_depth,
int sgr_params_idx, int radius_idx,
int pass, int32_t *A, int32_t *B) {
const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
const int step = pass == 0 ? 1 : 2;
int i, j;
assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
"Need SGRPROJ_BORDER_* >= r+1");
boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
// Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
// for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
for (i = -1; i < height + 1; i += step) {
for (j = -1; j < width + 1; ++j) {
const int k = i * buf_stride + j;
const int n = (2 * r + 1) * (2 * r + 1);
// a < 2^16 * n < 2^22 regardless of bit depth
uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
// b < 2^8 * n < 2^14 regardless of bit depth
uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
// Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
// and p itself satisfies p < 2^14 * n^2 < 2^26.
// This bound on p is due to:
// https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
//
// Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
// This is an artefact of rounding, and can only happen if all pixels
// are (almost) identical, so in this case we saturate to p=0.
uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
const uint32_t s = params->s[radius_idx];
// p * s < (2^14 * n^2) * round(2^20 / (n^2 * eps)) < 2^34 / eps < 2^32
// as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
// (this holds even after accounting for the rounding in s)
const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
// Note: We have to be quite careful about the value of A[k].
// This is used as a blend factor between individual pixel values and the
// local mean. So it logically has a range of [0, 256], including both
// endpoints.
//
// This is a pain for hardware, as we'd like something which can be stored
// in exactly 8 bits.
// Further, in the calculation of B[k] below, if z == 0 and r == 2,
// then A[k] "should be" 0. But then we can end up setting B[k] to a value
// slightly above 2^(8 + bit depth), due to rounding in the value of
// av1_one_by_x[25-1].
//
// Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
// This fixes the above issues (256 - A[k] fits in a uint8, and we can't
// overflow), without significantly affecting the final result: z == 0
// implies that the image is essentially "flat", so the local mean and
// individual pixel values are very similar.
//
// Note that saturating on the other side, i.e. requiring A[k] <= 255,
// would be a bad idea, as that corresponds to the case where the image
// is very variable, when we want to preserve the local pixel value as
// much as possible.
A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
// SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
// av1_one_by_x[n - 1] = round(2^12 / n)
// => the product here is < 2^(20 + bit_depth) <= 2^32,
// and B[k] is set to a value < 2^(8 + bit depth)
// This holds even with the rounding in av1_one_by_x and in the overall
// result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
(uint32_t)B[k] *
(uint32_t)av1_one_by_x[n - 1],
SGRPROJ_RECIP_BITS);
}
}
}
static void selfguided_restoration_fast_internal(
int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int i, j;
calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
sgr_params_idx, radius_idx, 1, A, B);
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
// Use the A[] and B[] arrays to calculate the filtered image
(void)r;
assert(r == 2);
for (i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
for (j = 0; j < width; ++j) {
const int k = i * buf_stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
(A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
5;
const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
(B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
5;
const int32_t v = a * dgd[l] + b;
dst[m] =
ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
} else { // odd row
for (j = 0; j < width; ++j) {
const int k = i * buf_stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 4;
const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
const int32_t v = a * dgd[l] + b;
dst[m] =
ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
}
}
}
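// Note: the cross-shaped weights above sum to 2 * 6 + 4 * 5 = 32 = 2^5 on even
// rows and to 6 + 2 * 5 = 16 = 2^4 on odd rows, which is why nb is 5 and 4
// respectively in the rounding shifts.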
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
int dgd_stride, int32_t *dst,
int dst_stride, int bit_depth,
int sgr_params_idx,
int radius_idx) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int i, j;
calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
sgr_params_idx, radius_idx, 0, A, B);
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
// Use the A[] and B[] arrays to calculate the filtered image
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int k = i * buf_stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
(A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
4 +
(A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
3;
const int32_t b =
(B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
4 +
(B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
3;
const int32_t v = a * dgd[l] + b;
dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
}
}
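// Here the 3x3 weights sum to 5 * 4 + 4 * 3 = 32 = 2^5, matching nb = 5.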
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1,
int flt_stride, int sgr_params_idx,
int bit_depth) {
int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
}
}
const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
// If params->r == 0 we skip the corresponding filter. We only allow one of
// the radii to be 0, as having both equal to 0 would be equivalent to
// skipping SGR entirely.
assert(!(params->r[0] == 0 && params->r[1] == 0));
if (params->r[0] > 0)
selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
flt0, flt_stride, bit_depth,
sgr_params_idx, 0);
if (params->r[1] > 0)
selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
flt_stride, bit_depth, sgr_params_idx, 1);
return 0;
}
void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
int height, int stride, int eps,
const int *xqd, uint8_t *dst8,
int dst_stride, int32_t *tmpbuf,
int bit_depth) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
const int ret = av1_selfguided_restoration_c(
dat8, width, height, stride, flt0, flt1, width, eps, bit_depth);
(void)ret;
assert(!ret);
const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
av1_decode_xq(xqd, xq, params);
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
const int k = i * width + j;
uint8_t *dst8ij = dst8 + i * dst_stride + j;
const uint8_t *dat8ij = dat8 + i * stride + j;
const uint16_t pre_u = *CONVERT_TO_SHORTPTR(dat8ij);
const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
int32_t v = u << SGRPROJ_PRJ_BITS;
// If params->r == 0 then we skipped the filtering in
// av1_selfguided_restoration_c, i.e. flt[k] == u
if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
const int16_t w =
(int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
const uint16_t out = clip_pixel_highbd(w, bit_depth);
*CONVERT_TO_SHORTPTR(dst8ij) = out;
}
}
}
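// Sanity check: with xq[0] == xq[1] == 0 (or flt0[k] == flt1[k] == u), v
// reduces to u << SGRPROJ_PRJ_BITS and w to the original input pixel, so the
// projection step leaves the pixel unchanged.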
#if CONFIG_PC_WIENER
#if CONFIG_COMBINE_PC_NS_WIENER
// 7 x 7 combined filters.
#define MIN_ROW -3
#define MAX_ROW 3
#define MIN_COL -3
#define MAX_COL 3
#define MAX_NUM_TAPS ((MAX_ROW - MIN_ROW + 1) * (MAX_COL - MIN_COL + 1))
#define IMPOSSIBLE_TAP_POSITION -1
#define NS_TAP_POS 2
#define PC_TAP_POS 3
// Assumes luma-only combination; replicate these buffers (x3) if chroma
// support is needed.
// Format of the four entries in the inner dimension:
// offset-row, offset-col, filter1-tap-posn, filter2-tap-posn.
static int combined_tap_positions[MAX_NUM_TAPS][4] = { 0 };
// Number of taps in the combined filter.
static int combined_total_taps = 0;
static int32_t combined_filter[MAX_NUM_TAPS] = { 0 };
static int combined_tap_config[MAX_NUM_TAPS][3] = { 0 };
static NonsepFilterConfig combined_filter_config = {
0, 0, 0, combined_tap_config, NULL, 0, 0
};
// Correction factor to account for nsfilters filtering pixel differences.
static int32_t combined_filter_correction = 0;
// Useful in storing the two configs that have been combined to help skip
// set_combined_filter_tap_positions when it is not needed.
static int prev_ns_tap_config[MAX_NUM_TAPS][3] = { 0 };
static int prev_pc_tap_config[MAX_NUM_TAPS][3] = { 0 };
static bool is_config_same(const NonsepFilterConfig *filter_config,
const int (*prev_tap_config)[3]) {
assert(filter_config->num_pixels <= MAX_NUM_TAPS);
for (int k = 0; k < filter_config->num_pixels; ++k) {
for (int l = 0; l < 3; ++l) {
if (filter_config->config[k][l] != prev_tap_config[k][l]) return false;
}
}
return true;
}
static void copy_to_prev_tap_config(const NonsepFilterConfig *filter_config,
int (*prev_tap_config)[3]) {
assert(filter_config->num_pixels <= MAX_NUM_TAPS);
for (int k = 0; k < filter_config->num_pixels; ++k) {
for (int l = 0; l < 3; ++l) {
prev_tap_config[k][l] = filter_config->config[k][l];
}
}
}
// Checks if (row, col) exists in the filter_config and returns the matching
// filter-tap position.
static int get_matching_filter_position(const NonsepFilterConfig *filter_config,
int row, int col) {
int pos = IMPOSSIBLE_TAP_POSITION;
for (int k = 0; k < filter_config->num_pixels; ++k) {
if ((row == filter_config->config[k][NONSEP_ROW_ID]) &&
(col == filter_config->config[k][NONSEP_COL_ID])) {
pos = filter_config->config[k][NONSEP_BUF_POS];
assert(pos != IMPOSSIBLE_TAP_POSITION);
break;
}
}
return pos;
}
static bool is_combined_tap_positions_change_needed(
const NonsepFilterConfig *nsfilter_config,
const NonsepFilterConfig *pcfilter_config) {
return !is_config_same(nsfilter_config, prev_ns_tap_config) ||
!is_config_same(pcfilter_config, prev_pc_tap_config);
}
// Combines the tap positions from two configs (of two filters) into
// combined_tap_positions. Useful in deriving a config for the sum filter
// determined by summing the two filters.
static void set_combined_filter_tap_positions(
const NonsepFilterConfig *nsfilter_config,
const NonsepFilterConfig *pcfilter_config) {
// Check if we need to recalculate.
if (!is_combined_tap_positions_change_needed(nsfilter_config,
pcfilter_config))
return;
int total_taps = 0;
for (int r = MIN_ROW; r <= MAX_ROW; ++r) {
for (int c = MIN_COL; c <= MAX_COL; ++c) {
const int pos_ns = get_matching_filter_position(nsfilter_config, r, c);
const int pos_pc = get_matching_filter_position(pcfilter_config, r, c);
if (pos_ns == IMPOSSIBLE_TAP_POSITION &&
pos_pc == IMPOSSIBLE_TAP_POSITION)
continue;
combined_tap_positions[total_taps][NONSEP_ROW_ID] = r;
combined_tap_positions[total_taps][NONSEP_COL_ID] = c;
combined_tap_positions[total_taps][NS_TAP_POS] = pos_ns;
combined_tap_positions[total_taps][PC_TAP_POS] = pos_pc;
++total_taps;
}
}
combined_total_taps = total_taps;
copy_to_prev_tap_config(nsfilter_config, prev_ns_tap_config);
copy_to_prev_tap_config(pcfilter_config, prev_pc_tap_config);
}
// Adds the two filters pointed to by the configs. Assumes
// combined_tap_positions has been set.
static void add_filters(const NonsepFilterConfig *nsfilter_config,
const NonsepFilterConfig *pcfilter_config,
const int16_t *nsfilter, const int32_t *pcfilter,
const int32_t nsmultiplier,
const int32_t pcmultiplier) {
// TODO(oguleryuz): Add buffers for chroma.
assert(PC_WIENER_PROCESS_CHROMA == 0);
// Leave num_pixels2, config1, config2, strict_bounds as in initializer.
combined_filter_config.num_pixels = combined_total_taps;
combined_filter_correction = 0;
const int mult_room = PC_WIENER_MULT_ROOM;
// The sum filter should have the higher precision. Figure out how much
// shift is needed for each summand.
int ns_prec_shift = 0;
int pc_prec_shift = 0;
if (nsfilter_config->prec_bits > pcfilter_config->prec_bits) {
combined_filter_config.prec_bits = nsfilter_config->prec_bits + mult_room;
pc_prec_shift += nsfilter_config->prec_bits - pcfilter_config->prec_bits;
} else {
combined_filter_config.prec_bits = pcfilter_config->prec_bits + mult_room;
ns_prec_shift += pcfilter_config->prec_bits - nsfilter_config->prec_bits;
}
const int32_t ns_scale = nsmultiplier << ns_prec_shift;
const int32_t pc_scale = pcmultiplier << pc_prec_shift;
// After the addition combined taps are at cb bits where,
// cb = combined_filter_config.prec_bits - mult_room + PC_WIENER_PREC_FEATURE.
// Right shift by cb - combined_filter_config.prec_bits, i.e.,
// by PC_WIENER_PREC_FEATURE - mult_room to bring them down to
// combined_filter_config.prec_bits precision.
const int mult_shift = PC_WIENER_PREC_FEATURE - mult_room;
for (int k = 0; k < combined_total_taps; ++k) {
int32_t tap = 0;
const int ns_tap_posn = combined_tap_positions[k][NS_TAP_POS];
const int pc_tap_posn = combined_tap_positions[k][PC_TAP_POS];
if (ns_tap_posn == IMPOSSIBLE_TAP_POSITION)
tap = ROUND_POWER_OF_TWO_SIGNED(pc_scale * pcfilter[pc_tap_posn],
mult_shift);
else if (pc_tap_posn == IMPOSSIBLE_TAP_POSITION) {
tap = ROUND_POWER_OF_TWO_SIGNED(ns_scale * nsfilter[ns_tap_posn],
mult_shift);
combined_filter_correction += tap;
} else {
const int ns_tap = ns_scale * nsfilter[ns_tap_posn];
combined_filter_correction +=
ROUND_POWER_OF_TWO_SIGNED(ns_tap, mult_shift);
tap = ROUND_POWER_OF_TWO_SIGNED(pc_scale * pcfilter[pc_tap_posn] + ns_tap,
mult_shift);
}
combined_tap_config[k][NONSEP_ROW_ID] =
combined_tap_positions[k][NONSEP_ROW_ID];
combined_tap_config[k][NONSEP_COL_ID] =
combined_tap_positions[k][NONSEP_COL_ID];
combined_tap_config[k][NONSEP_BUF_POS] = k;
combined_filter[k] = tap;
}
}
#endif // CONFIG_COMBINE_PC_NS_WIENER
static int get_tskip_stride(const AV1_COMMON *cm, int plane) {
int width = cm->mi_params.mi_cols << MI_SIZE_LOG2;
int w = ((width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2;
w >>= ((plane == 0) ? 0 : cm->seq_params.subsampling_x);
return (w + MIN_TX_SIZE - 1) >> MIN_TX_SIZE_LOG2;
}
// TODO(oguleryuz): This should remain in sync with av1_convert_qindex_to_q.
static int get_qstep(int base_qindex, int bit_depth, int *shift) {
int base_shift = QUANT_TABLE_BITS;
switch (bit_depth) {
case AOM_BITS_8:
*shift = 2 + base_shift;
return av1_ac_quant_QTX(base_qindex, 0, bit_depth);
case AOM_BITS_10:
*shift = 4 + base_shift;
return av1_ac_quant_QTX(base_qindex, 0, bit_depth);
case AOM_BITS_12:
*shift = 6 + base_shift;
return av1_ac_quant_QTX(base_qindex, 0, bit_depth);
default:
assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
return -1;
}
}
// TODO(oguleryuz): These need to move into allocated line buffers accessible
// by enc/dec so that alloc/free cycles are reduced.
#define MAX_FEATURE_LENGTH PC_WIENER_FEATURE_LENGTH_LUMA
#define NUM_FEATURE_LINE_BUFFERS (NUM_PC_WIENER_FEATURES * MAX_FEATURE_LENGTH)
static int buffer_width = 0;
static int16_t *feature_line_buffers[NUM_FEATURE_LINE_BUFFERS] = { 0 };
static int *feature_sum_buffers[NUM_PC_WIENER_FEATURES] = { 0 };
static int16_t *tskip_sum_buffer = 0;
static int directional_feature_accumulator[NUM_PC_WIENER_FEATURES] = { 0 };
static int tskip_feature_accumulator = 0;
static int feature_normalizers[NUM_PC_WIENER_FEATURES + 1] = { 0 };
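// Line-buffer layout: feature_line_buffers keeps, for each of the
// NUM_PC_WIENER_FEATURES directional features, the last feature_length rows
// of absolute gradient responses. feature_sum_buffers holds the per-column
// sums over those rows, while directional_feature_accumulator and
// tskip_feature_accumulator hold the running horizontal sums for the current
// window. rotate_feature_line_buffers() recycles the oldest row.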
static void rotate_feature_line_buffers(int feature_len) {
assert(feature_len <= MAX_FEATURE_LENGTH);
for (int feature = 0; feature < NUM_PC_WIENER_FEATURES; ++feature) {
const int row_begin = feature * feature_len;
int16_t *buffer_0 = feature_line_buffers[row_begin];
for (int row = row_begin; row < row_begin + feature_len - 1; ++row) {
feature_line_buffers[row] = feature_line_buffers[row + 1];
}
feature_line_buffers[row_begin + feature_len - 1] = buffer_0;
}
}
// Calculates and accumulates the gradients over a window around row. If
// use_strict_bounds is false, dgd must have valid data over the processed
// columns for rows in [row_begin, row_end), where
// row_begin = row - PC_WIENER_FEATURE_LENGTH / 2 and
// row_end = row + PC_WIENER_FEATURE_LENGTH / 2 + 1.
// This version of the routine assumes use_strict_bounds is false.
static void fill_directional_feature_buffers_highbd(int row, int buffer_row,
const uint16_t *dgd,
int dgd_stride, int width,
int feature_length,
bool use_strict_bounds) {
assert(use_strict_bounds == false);
const int col_begin = -feature_length / 2;
const int col_end = width + feature_length / 2;
const int buffer_row_0 = buffer_row;
const int buffer_row_1 = buffer_row_0 + feature_length;
const int buffer_row_2 = buffer_row_1 + feature_length;
const int buffer_row_3 = buffer_row_2 + feature_length;
int buffer_col = 0;
// TODO(oguleryuz): Reduce buffer sizes for downsampling.
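// Pass 1: subtract the oldest row's responses from the per-column sums.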
#pragma GCC ivdep
for (int col = col_begin; col < col_end; ++col, ++buffer_col) {
feature_sum_buffers[0][buffer_col] -=
feature_line_buffers[buffer_row_0][buffer_col];
feature_sum_buffers[1][buffer_col] -=
feature_line_buffers[buffer_row_1][buffer_col];
feature_sum_buffers[2][buffer_col] -=
feature_line_buffers[buffer_row_2][buffer_col];
feature_sum_buffers[3][buffer_col] -=
feature_line_buffers[buffer_row_3][buffer_col];
}
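// Pass 2: compute the absolute second-difference responses (H, V, A, D) for
// the incoming row.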
buffer_col = 0;
#pragma GCC ivdep
for (int col = col_begin; col < col_end; ++col, ++buffer_col) {
const int dgd_id = row * dgd_stride + col;
const int prev_row = dgd_id - dgd_stride;
const int next_row = dgd_id + dgd_stride;
// D V A
// H O H
// A V D
const int16_t base_value = 2 * dgd[dgd_id]; // O.
const int16_t horizontal_diff =
dgd[dgd_id + 1] + dgd[dgd_id - 1] - base_value; // H.
int16_t vertical_diff = dgd[prev_row] - base_value; // V.
int16_t anti_diagonal_diff = dgd[prev_row + 1] - base_value; // A.
int16_t diagonal_diff = dgd[prev_row - 1] - base_value; // D.
vertical_diff += dgd[next_row];
anti_diagonal_diff += dgd[next_row - 1];
diagonal_diff += dgd[next_row + 1];
feature_line_buffers[buffer_row_0][buffer_col] =
abs(horizontal_diff); // f0
feature_line_buffers[buffer_row_1][buffer_col] = abs(vertical_diff); // f1
feature_line_buffers[buffer_row_2][buffer_col] =
abs(anti_diagonal_diff); // f2
feature_line_buffers[buffer_row_3][buffer_col] = abs(diagonal_diff); // f3
}
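// Pass 3: add the new row's responses to the per-column sums.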
buffer_col = 0;
#pragma GCC ivdep
for (int col = col_begin; col < col_end; ++col, ++buffer_col) {
feature_sum_buffers[0][buffer_col] +=
feature_line_buffers[buffer_row_0][buffer_col];
feature_sum_buffers[1][buffer_col] +=
feature_line_buffers[buffer_row_1][buffer_col];
feature_sum_buffers[2][buffer_col] +=
feature_line_buffers[buffer_row_2][buffer_col];
feature_sum_buffers[3][buffer_col] +=
feature_line_buffers[buffer_row_3][buffer_col];
}
}
static void allocate_pcwiener_line_buffers(int procunit_width) {
buffer_width = procunit_width + MAX_FEATURE_LENGTH - 1;
for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
// This should be done only once.
feature_line_buffers[j] = (int16_t *)(aom_malloc(
buffer_width * sizeof(*feature_line_buffers[j])));
}
for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
// This should be done only once.
feature_sum_buffers[j] =
(int *)(aom_malloc(buffer_width * sizeof(*feature_sum_buffers[j])));
}
tskip_sum_buffer =
(int16_t *)(aom_malloc(buffer_width * sizeof(*tskip_sum_buffer)));
}
static void free_pcwiener_line_buffers(void) {
for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
aom_free(feature_line_buffers[j]);
feature_line_buffers[j] = NULL;
}
for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
aom_free(feature_sum_buffers[j]);
feature_sum_buffers[j] = NULL;
}
aom_free(tskip_sum_buffer);
tskip_sum_buffer = NULL;
buffer_width = 0;
}
static void clear_line_buffers(void) {
for (int k = 0; k < NUM_FEATURE_LINE_BUFFERS; ++k)
memset(feature_line_buffers[k], 0,
sizeof(*feature_line_buffers[k]) * buffer_width);
for (int k = 0; k < NUM_PC_WIENER_FEATURES; ++k)
memset(feature_sum_buffers[k], 0,
sizeof(*feature_sum_buffers[k]) * buffer_width);
memset(tskip_sum_buffer, 0, sizeof(*tskip_sum_buffer) * buffer_width);
}
// Accumulates tskip over a window of rows centered at row. The window spans
// rows in [row_begin, row_end), where
// row_begin = row - PC_WIENER_TSKIP_LENGTH / 2 and
// row_end = row + PC_WIENER_TSKIP_LENGTH / 2 + 1.
// This version of the routine assumes use_strict_bounds is true and clamps
// all reads to the frame.
static void fill_tskip_sum_buffer(int row, const uint8_t *tskip,
int tskip_stride, int width, int height,
int tskip_length, bool use_strict_bounds) {
// TODO(oguleryuz): tskip needs boundary extension.
assert(use_strict_bounds == true);
const int col_begin = -tskip_length / 2;
const int col_end = width + tskip_length / 2;
const int clamped_row = AOMMAX(AOMMIN(row, height - 1), 0);
int buffer_col = 0;
int left_tskip_id =
(clamped_row >> MI_SIZE_LOG2) * tskip_stride + (0 >> MI_SIZE_LOG2);
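// Columns left of 0 replicate the tskip entry at column 0.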
for (int col = col_begin; col < 0; ++col) {
tskip_sum_buffer[buffer_col] += tskip[left_tskip_id];
++buffer_col;
}
int tskip_id_base = (clamped_row >> MI_SIZE_LOG2) * tskip_stride;
#pragma GCC ivdep
for (int col = 0; col < (width >> MI_SIZE_LOG2); ++col) {
const uint8_t tskip_val = tskip[tskip_id_base + col];
for (int i = 0; i < (1 << MI_SIZE_LOG2); ++i) {
tskip_sum_buffer[buffer_col] += tskip_val;
++buffer_col;
}
}
for (int col = (width >> MI_SIZE_LOG2) << MI_SIZE_LOG2; col < width; ++col) {
int tskip_id =
(clamped_row >> MI_SIZE_LOG2) * tskip_stride + (col >> MI_SIZE_LOG2);
tskip_sum_buffer[buffer_col] += tskip[tskip_id];
++buffer_col;
}
int right_tskip_id = (clamped_row >> MI_SIZE_LOG2) * tskip_stride +
((width - 1) >> MI_SIZE_LOG2);
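// Columns beyond width replicate the tskip entry at column width - 1.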
for (int col = width; col < col_end; ++col) {
tskip_sum_buffer[buffer_col] += tskip[right_tskip_id];
++buffer_col;
}
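// Subtract the row that has just left the vertical window; there is nothing
// to subtract until the window is full.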
int subtract_row = row - tskip_length;
if (subtract_row >= -tskip_length / 2) {
assert(subtract_row <= height - 1);
subtract_row = subtract_row >= 0 ? subtract_row : 0;
buffer_col = 0;
left_tskip_id =
(subtract_row >> MI_SIZE_LOG2) * tskip_stride + (0 >> MI_SIZE_LOG2);
for (int col = col_begin; col < 0; ++col) {
tskip_sum_buffer[buffer_col] -= tskip[left_tskip_id];
++buffer_col;
}
tskip_id_base = (subtract_row >> MI_SIZE_LOG2) * tskip_stride;
#pragma GCC ivdep
for (int col = 0; col < (width >> MI_SIZE_LOG2); ++col) {
const uint8_t tskip_val = tskip[tskip_id_base + col];
for (int i = 0; i < (1 << MI_SIZE_LOG2); ++i) {
tskip_sum_buffer[buffer_col] -= tskip_val;
++buffer_col;
}
}
for (int col = (width >> MI_SIZE_LOG2) << MI_SIZE_LOG2; col < width;
++col) {
int tskip_id =
(subtract_row >> MI_SIZE_LOG2) * tskip_stride + (col >> MI_SIZE_LOG2);
tskip_sum_buffer[buffer_col] -= tskip[tskip_id];
++buffer_col;
}
right_tskip_id = (subtract_row >> MI_SIZE_LOG2) * tskip_stride +
((width - 1) >> MI_SIZE_LOG2);
for (int col = width; col < col_end; ++col) {
tskip_sum_buffer[buffer_col] -= tskip[right_tskip_id];
++buffer_col;
}
}
}
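// Slides the horizontal window for the directional features at output column
// col: adds the per-column sum entering the window and, once the window is
// full (cl >= 0), subtracts the one leaving it.
// fill_tskip_feature_accumulator() below does the same for the tskip sums.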
static void fill_directional_feature_accumulators(int col, int col_offset,
int feature_length) {
const int col_base = col + col_offset + feature_length / 2;
const int cl = col_base - feature_length;
// TODO: remove this branch by guaranteeing that cl is never negative.
if (cl < 0) {
const int cr = col_base;
for (int k = 0; k < NUM_PC_WIENER_FEATURES; k++) {
directional_feature_accumulator[k] += feature_sum_buffers[k][cr];
}
return;
}
const int cr = col_base;
for (int k = 0; k < NUM_PC_WIENER_FEATURES; ++k) {
directional_feature_accumulator[k] +=
feature_sum_buffers[k][cr] - feature_sum_buffers[k][cl];
}
}
static void fill_tskip_feature_accumulator(int col, int col_offset,
int tskip_length) {
const int col_base = col + col_offset + tskip_length / 2;
const int cl = col_base - tskip_length;
// TODO: remove this branch by guaranteeing that cl is never negative.
if (cl < 0) {
const int cr = col_base;
tskip_feature_accumulator += tskip_sum_buffer[cr];
return;
}
const int cr = col_base;
tskip_feature_accumulator += tskip_sum_buffer[cr] - tskip_sum_buffer[cl];
}
// Initializes the accumulators over the leftmost window of the line, leaving
// out the final column, which the first update_accumulators() call adds.
static void initialize_feature_accumulators(int feature_length,
int tskip_length) {
for (int k = 0; k < NUM_PC_WIENER_FEATURES; ++k) {
directional_feature_accumulator[k] = 0;
}
tskip_feature_accumulator = 0;
const int feature_half_length = feature_length / 2;
const int tskip_half_length = tskip_length / 2;
// Initialize accumulators on the leftmost portion of the line.
for (int col_offset = -feature_half_length; col_offset < feature_half_length;
++col_offset) {
fill_directional_feature_accumulators(0, col_offset, feature_length);
}
for (int col_offset = -tskip_half_length; col_offset < tskip_half_length;
++col_offset) {
fill_tskip_feature_accumulator(0, col_offset, tskip_length);
}
}
// Slides both window sums one column to the right for output column col.
static void update_accumulators(int col, int feature_length,
int feature_half_length, int tskip_length,
int tskip_half_length) {
fill_directional_feature_accumulators(col, feature_half_length,
feature_length);
fill_tskip_feature_accumulator(col, tskip_half_length, tskip_length);
}
// Calculates the features needed for get_pcwiener_index.
static void calculate_features(int32_t *feature_vector, int bit_depth) {
for (int f = 0; f < NUM_PC_WIENER_FEATURES; ++f) {
feature_vector[f] =
directional_feature_accumulator[f] * feature_normalizers[f];
}
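// Gradient magnitudes scale with bit depth; shift back to an 8-bit
// equivalent range.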
const int bit_depth_shift = bit_depth - 8;
if (bit_depth_shift) {
for (int f = 0; f < NUM_PC_WIENER_FEATURES; ++f)
feature_vector[f] =
ROUND_POWER_OF_TWO_SIGNED(feature_vector[f], bit_depth_shift);
}
const int tskip_index = NUM_PC_WIENER_FEATURES;
feature_vector[tskip_index] =
tskip_feature_accumulator * feature_normalizers[tskip_index];
}
// Lookup table holding the qindex- and tskip-dependent part of the
// per-feature threshold input used by get_pcwiener_index(). Precomputing it
// reduces the per-pixel work to one addition per feature.
static int qval_given_tskip_lut[256][NUM_PC_WIENER_FEATURES] = { 0 };
static void fill_qval_given_tskip_lut(int base_qindex, int bit_depth) {
int qstep_shift = 0;
int qstep = get_qstep(base_qindex, bit_depth, &qstep_shift);
qstep_shift += 8; // normalization in tf
const int bit_depth_shift = bit_depth - 8;
if (bit_depth_shift) {
qstep = ROUND_POWER_OF_TWO_SIGNED(qstep, bit_depth_shift);
qstep_shift -= bit_depth_shift;
}
// actual * 256
const int tskip_shift = 8;
const int diff_shift = qstep_shift - tskip_shift;
assert(diff_shift >= 0);
// Fill all 256 possible tskip feature values.
for (int tskip = 0; tskip < 256; ++tskip) {
const int tskip_shifted = tskip * (1 << diff_shift);
const int tskip_qstep_prod =
ROUND_POWER_OF_TWO_SIGNED(tskip * qstep, tskip_shift);
const int total_shift = qstep_shift;
// Arithmetic ideas: tskip can be divided by 2, qstep can be scaled down.
for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i) {
int32_t qval = (mode_weights[i][0] * tskip_shifted) +
(mode_weights[i][1] * qstep) +
(mode_weights[i][2] * tskip_qstep_prod);
qval = ROUND_POWER_OF_TWO_SIGNED(qval, total_shift);
qval += mode_offsets[i]; // actual * (1 << PC_WIENER_PREC_FEATURE)
qval_given_tskip_lut[tskip][i] = 255 * qval;
}
}
}
static void set_feature_normalizers(bool is_uv) {
if (is_uv) {
for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i)
feature_normalizers[i] = feature_normalizers_chroma[i];
// Normalizer for tskip.
feature_normalizers[NUM_PC_WIENER_FEATURES] =
(int)(256 / NUM_PC_WIENER_TSKIP_TAPS_CHROMA);
} else {
for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i)
feature_normalizers[i] = feature_normalizers_luma[i];
feature_normalizers[NUM_PC_WIENER_FEATURES] =
(int)(256 / NUM_PC_WIENER_TSKIP_TAPS_LUMA);
}
}
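// Quantizes the directional and tskip features and combines them into an
// index into pc_wiener_lut_to_filter_index, selecting the filter for the
// current block. *multiplier is set to the weight (at PC_WIENER_PREC_FEATURE
// precision) used when the selected filter is merged with the nonseparable
// Wiener filter.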
static int get_pcwiener_index(int bit_depth, int32_t *multiplier) {
int32_t feature_vector[NUM_PC_WIENER_FEATURES + 1]; // 255 x actual
// Fill the feature vector.
calculate_features(feature_vector, bit_depth);
// actual * 256
const int tskip_index = NUM_PC_WIENER_FEATURES;
const int tskip = feature_vector[tskip_index];
assert(tskip < 256);
for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i)
assert(feature_vector[i] >= 0);
for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i) {
int32_t qval = ROUND_POWER_OF_TWO_SIGNED(
feature_vector[i] + qval_given_tskip_lut[tskip][i],
PC_WIENER_PREC_FEATURE);
// qval range is [0, 1] -> [0, 255]
feature_vector[i] = clip_pixel(qval) >> pc_wiener_threshold_shift;
}
int lut_input = 0;
for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i) {
lut_input += pc_wiener_thresholds[i] * feature_vector[i];
}
*multiplier = 1 << PC_WIENER_PREC_FEATURE;
assert(lut_input == AOMMAX(AOMMIN(lut_input, PC_WIENER_LUT_SIZE - 1), 0));
const int filter_index = pc_wiener_lut_to_filter_index[lut_input];
assert(filter_index ==
AOMMAX(AOMMIN(filter_index, NUM_PC_WIENER_FILTERS - 1), 0));
return filter_index;
}
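// Filters one processing unit with the pixel-classified Wiener filter:
// per-pixel (or per PC_WIENER_BLOCK_SIZE block) directional and tskip
// features select a filter from the pretrained set, which is then applied
// with symmetric taps. When CONFIG_COMBINE_PC_NS_WIENER is on and nsfilter is
// non-NULL, the selected filter is first merged with the nonseparable Wiener
// filter via add_filters().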
void apply_pc_wiener_highbd(const uint8_t *dgd8, int width, int height,
int stride, uint8_t *dst8, int dst_stride,
const uint8_t *tskip, int tskip_stride,
#if CONFIG_COMBINE_PC_NS_WIENER
const int16_t *nsfilter,
#endif // CONFIG_COMBINE_PC_NS_WIENER
bool is_uv, int bit_depth) {
const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
if (!PC_WIENER_PROCESS_CHROMA && is_uv) {
// Not filtering uv.
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
dst[i * dst_stride + j] = dgd[i * stride + j];
}
}
return;
}
const int pc_filter_num_taps = is_uv
? sizeof(pcwiener_tap_config_chroma) /
sizeof(pcwiener_tap_config_chroma[0])
: sizeof(pcwiener_tap_config_luma) /
sizeof(pcwiener_tap_config_luma[0]);
const NonsepFilterConfig pcfilter_config = { PC_WIENER_PREC_FILTER,
pc_filter_num_taps,
0,
is_uv
? pcwiener_tap_config_chroma
: pcwiener_tap_config_luma,
NULL,
0,
0 };
const NonsepFilterConfig *filter_config = &pcfilter_config;
bool multiply_here = true;
int32_t correction_factor = 0;
#if CONFIG_COMBINE_PC_NS_WIENER
const WienernsFilterConfigPairType *wnsf = get_wienerns_filters(base_qindex);
const NonsepFilterConfig *nsfilter_config =
is_uv ? &wnsf->uv->nsfilter : &wnsf->y->nsfilter;
assert(nsfilter_config->strict_bounds == false);
if (nsfilter != NULL) {
multiply_here = false;
filter_config = &combined_filter_config;
set_combined_filter_tap_positions(nsfilter_config, &pcfilter_config);
}
#endif // CONFIG_COMBINE_PC_NS_WIENER
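// The last config entry is expected to be the unpaired center (singleton)
// tap; the remaining taps come in symmetric pairs traversed via
// config[2 * k].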
const int singleton_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
#if CONFIG_COMBINE_PC_NS_WIENER
const int num_sym_taps = (filter_config->num_pixels - 1) / 2;
#else
const int num_sym_taps = is_uv ? (2 * NUM_PC_WIENER_TAPS_CHROMA - 1) / 2
: (2 * NUM_PC_WIENER_TAPS_LUMA - 1) / 2;
#endif // CONFIG_COMBINE_PC_NS_WIENER
assert(num_sym_taps == (filter_config->num_pixels - 1) / 2);
assert(num_sym_taps <= 24);
int16_t compute_buffer[24];
int pixel_offset_diffs[24];
int filter_pos[24];
for (int k = 0; k < num_sym_taps; ++k) {
const int r = filter_config->config[2 * k][NONSEP_ROW_ID];
const int c = filter_config->config[2 * k][NONSEP_COL_ID];
const int diff = r * stride + c;
pixel_offset_diffs[k] = diff;
filter_pos[k] = filter_config->config[2 * k][NONSEP_BUF_POS];
}
int16_t max_pixel_value = 255;
switch (bit_depth) {
case 10: max_pixel_value = 1023; break;
case 12: max_pixel_value = 4095; break;
}
assert(filter_config->strict_bounds == false);
const bool dir_strict = filter_config->strict_bounds;
const bool tskip_strict = true;
const int feature_length =
is_uv ? PC_WIENER_FEATURE_LENGTH_CHROMA : PC_WIENER_FEATURE_LENGTH_LUMA;
const int feature_half_length = feature_length / 2;
const int tskip_length =
is_uv ? PC_WIENER_TSKIP_LENGTH_CHROMA : PC_WIENER_TSKIP_LENGTH_LUMA;
const int tskip_half_length = tskip_length / 2;
set_feature_normalizers(is_uv);
clear_line_buffers();
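// Prime the vertical sliding windows with the rows above the first output
// row before entering the main loop.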
for (int row = 0; row < feature_length - 1; ++row) {
fill_directional_feature_buffers_highbd(row - feature_half_length, row, dgd,
stride, width, feature_length,
dir_strict);
}
for (int row = 0; row < tskip_length - 1; ++row) {
fill_tskip_sum_buffer(row - tskip_half_length, tskip, tskip_stride, width,
height, tskip_length, tskip_strict);
}
for (int i = 0; i < height; ++i) {
fill_directional_feature_buffers_highbd(i + feature_half_length,
feature_length - 1, dgd, stride,
width, feature_length, dir_strict);
fill_tskip_sum_buffer(i + tskip_half_length, tskip, tskip_stride, width,
height, tskip_length, tskip_strict);
#if PC_WIENER_BLOCK_SIZE > 1
const bool skip_row_compute =
i % PC_WIENER_BLOCK_SIZE != PC_WIENER_BLOCK_ROW_OFFSET;
#else
// Ensure the flag exists when block-based skipping is disabled.
const bool skip_row_compute = false;
#endif // PC_WIENER_BLOCK_SIZE > 1
// Initialize accumulators on the leftmost portion of the line.
if (!skip_row_compute) {
initialize_feature_accumulators(feature_length, tskip_length);
}
for (int j = 0; j < width; ++j) {
if (!skip_row_compute) {
update_accumulators(j, feature_length, feature_half_length,
tskip_length, tskip_half_length);
}
#if PC_WIENER_BLOCK_SIZE > 1
if (skip_row_compute ||
j % PC_WIENER_BLOCK_SIZE != PC_WIENER_BLOCK_COL_OFFSET)
continue;
#endif // PC_WIENER_BLOCK_SIZE > 1
int32_t multiplier = 0;
const int filter_index = get_pcwiener_index(bit_depth, &multiplier);
const int16_t *filter = is_uv ? pcwiener_filters_chroma[filter_index]
: pcwiener_filters_luma[filter_index];
const int16_t singleton_tap =
filter[singleton_tap_index] + (1 << filter_config->prec_bits);
#if CONFIG_COMBINE_PC_NS_WIENER
if (nsfilter != NULL) {
add_filters(nsfilter_config, &pcfilter_config, nsfilter, filter,
1 << PC_WIENER_PREC_FEATURE, multiplier);
filter = combined_filter;
correction_factor = combined_filter_correction;
}
#endif // CONFIG_COMBINE_PC_NS_WIENER
#if PC_WIENER_BLOCK_SIZE > 1
const int block_row_begin = i - PC_WIENER_BLOCK_ROW_OFFSET;
int block_row_end =
AOMMIN(block_row_begin + PC_WIENER_BLOCK_SIZE, height);
if (i + PC_WIENER_BLOCK_SIZE >= height) block_row_end = height;
const int block_col_begin = j - PC_WIENER_BLOCK_COL_OFFSET;
int block_col_end = AOMMIN(block_col_begin + PC_WIENER_BLOCK_SIZE, width);
if (j + PC_WIENER_BLOCK_SIZE >= width) block_col_end = width;
#else
const int block_row_begin = i;
const int block_row_end = i + 1;
const int block_col_begin = j;
const int block_col_end = j + 1;
#endif // PC_WIENER_BLOCK_SIZE > 1
#if USE_CONVOLVE_SYM
av1_convolve_symmetric_highbd_c(
dgd, stride, filter_config, filter, dst, dst_stride, bit_depth,
block_row_begin, block_row_end, block_col_begin, block_col_end);
#else
for (int r = block_row_begin; r < block_row_end; ++r) {
for (int c = block_col_begin; c < block_col_end; ++c) {
int dgd_id = r * stride + c;
// Gather the two symmetric taps in separate loops to soften a potential
// data cache miss.
for (int k = 0; k < num_sym_taps; ++k) {
const int diff = pixel_offset_diffs[k];
const int16_t tmp_sum = dgd[dgd_id - diff];
compute_buffer[k] = tmp_sum;
}
for (int k = 0; k < num_sym_taps; ++k) {
const int diff = pixel_offset_diffs[k];
const int16_t tmp_sum = dgd[dgd_id + diff];
compute_buffer[k] += tmp_sum;
}
// Handle singleton tap.
int32_t tmp = singleton_tap * dgd[dgd_id];
for (int k = 0; k < num_sym_taps; ++k) {
const int pos = filter_pos[k];
tmp += filter[pos] * compute_buffer[k];
}
if (multiply_here) {
tmp = ROUND_POWER_OF_TWO_SIGNED(tmp, filter_config->prec_bits);
} else {
// TODO(oguleryuz): Change pc training so that both filters operate
// the same way and a correction is not needed.
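// correction_factor is the sum of the ns-derived taps; subtracting it times
// the center pixel converts the raw-pixel sum above into a sum over
// (neighbor - center) differences for the ns portion of the filter.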
tmp -= correction_factor * dgd[dgd_id];
tmp = ROUND_POWER_OF_TWO_SIGNED(tmp, filter_config->prec_bits);
}
int dst_id = r * dst_stride + c;
dst[dst_id] = (tmp > max_pixel_value) ? max_pixel_value
: (tmp < 0) ? 0
: tmp;
}
}
#endif // USE_CONVOLVE_SYM
}
rotate_feature_line_buffers(feature_length);
}
}
static void pc_wiener_stripe_highbd(const RestorationUnitInfo *rui,
int stripe_width, int stripe_height,
int procunit_width, const uint8_t *src,
int src_stride, uint8_t *dst,
int dst_stride, int32_t *tmpbuf,
int bit_depth) {
(void)tmpbuf;
assert(rui->tskip);
bool is_uv = (rui->plane != AOM_PLANE_Y);
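// Rebuild the qindex/bit-depth dependent LUT only when either value changes
// between calls.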
static int prev_qindex = -1;
static int prev_bit_depth = -1;
if (rui->base_qindex + rui->qindex_offset != prev_qindex ||
bit_depth != prev_bit_depth) {
fill_qval_given_tskip_lut(rui->base_qindex + rui->qindex_offset, bit_depth);
prev_qindex = rui->base_qindex + rui->qindex_offset;
prev_bit_depth = bit_depth;
}
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
apply_pc_wiener_highbd(src + j, w, stripe_height, src_stride, dst + j,
dst_stride, rui->tskip + (j >> MI_SIZE_LOG2),
rui->tskip_stride,
#if CONFIG_COMBINE_PC_NS_WIENER
NULL,
#endif // CONFIG_COMBINE_PC_NS_WIENER
is_uv, bit_depth);
}
}
#endif // CONFIG_PC_WIENER
#if CONFIG_WIENER_NONSEP
void apply_wiener_nonsep_highbd(const uint8_t *dgd8, int width, int height,
int stride, int base_qindex,
const int16_t *filter, uint8_t *dst8,
int dst_stride, int plane, const uint8_t *luma8,
int luma_stride, int bit_depth) {
(void)luma8;
(void)luma_stride;
int is_uv = (plane != AOM_PLANE_Y);
const WienernsFilterConfigPairType *wnsf = get_wienerns_filters(base_qindex);
const NonsepFilterConfig *nsfilter =
is_uv ? &wnsf->uv->nsfilter : &wnsf->y->nsfilter;
const int16_t *filter_ = is_uv ? filter + wnsf->y->ncoeffs : filter;