/*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*/
#include <immintrin.h>
#include <assert.h>
#include "config/av1_rtcd.h"
#include "av1/common/convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
// on the left.
// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
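//
// A scalar sketch of what this routine computes (illustrative only; it mirrors
// the rounding and clamping constants used by the intrinsics below rather than
// the generic C reference, and round_0/round_1 stand for
// conv_params->round_0/round_1):
//
// Horizontal pass into temp[], intermediate_height = h + SUBPEL_TAPS - 1 rows.
// The center tap (k == 3) carries an extra (1 << FILTER_BITS) so that the
// source pixel is added back after the final shifts ("add_src"):
//   int32_t sum = (1 << (round_0 - 1)) + (1 << (bd + FILTER_BITS - 1));
//   for (int k = 0; k < SUBPEL_TAPS; ++k)
//     sum += (filter_x[k] + (k == 3 ? 1 << FILTER_BITS : 0)) * src[r][c + k];
//   temp[r][c] = clamp(sum >> round_0, 0, WIENER_CLAMP_LIMIT(round_0, bd) - 1);
//
// Vertical pass from temp[] into dst[], removing the bd-dependent offset:
//   int32_t sum = (1 << (round_1 - 1)) - (1 << (bd + round_1 - 1));
//   for (int k = 0; k < SUBPEL_TAPS; ++k)
//     sum += (filter_y[k] + (k == 3 ? 1 << FILTER_BITS : 0)) * temp[r + k][c];
//   dst[r][c] = clamp(sum >> round_1, 0, (1 << bd) - 1);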
void av1_highbd_wiener_convolve_add_src_avx2(
const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h,
const ConvolveParams *conv_params, int bd) {
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
(void)x_step_q4;
(void)y_step_q4;
const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
DECLARE_ALIGNED(32, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
int intermediate_height = h + SUBPEL_TAPS - 1;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
const __m128i zero_128 = _mm_setzero_si128();
const __m256i zero_256 = _mm256_setzero_si256();
// Add an offset to account for the "add_src" part of the convolve function.
const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
const __m256i clamp_low = zero_256;
/* Horizontal filter */
{
const __m256i clamp_high_ep =
_mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
// coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
// coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
// coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
const __m256i round_const = _mm256_set1_epi32(
(1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
for (int i = 0; i < intermediate_height; ++i) {
for (int j = 0; j < w; j += 16) {
const uint16_t *src_ij = src_ptr + i * src_stride + j;
// Load 16-bit src data
const __m256i src_0 = yy_loadu_256(src_ij + 0);
const __m256i src_1 = yy_loadu_256(src_ij + 1);
const __m256i src_2 = yy_loadu_256(src_ij + 2);
const __m256i src_3 = yy_loadu_256(src_ij + 3);
const __m256i src_4 = yy_loadu_256(src_ij + 4);
const __m256i src_5 = yy_loadu_256(src_ij + 5);
const __m256i src_6 = yy_loadu_256(src_ij + 6);
const __m256i src_7 = yy_loadu_256(src_ij + 7);
// Multiply src data by filter coeffs and sum pairs
const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
// Calculate scalar product for even- and odd-indices separately,
// increasing to 32-bit precision
const __m256i res_even_sum = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
const __m256i res_even = _mm256_srai_epi32(
_mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
const __m256i res_odd_sum = _mm256_add_epi32(
_mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
const __m256i res_odd = _mm256_srai_epi32(
_mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
// Reduce to 16-bit precision and pack even- and odd-index results
// back into one register. The _mm256_packs_epi32 intrinsic returns
// a register with the pixels ordered as follows:
// [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
const __m256i res = _mm256_packs_epi32(res_even, res_odd);
const __m256i res_clamped =
_mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
// Store in a temporary array
yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
}
}
}
/* Vertical filter */
{
const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
// coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
// coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
// coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
// coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
// coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
// coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
// coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
const __m256i round_const =
_mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
(1 << (bd + conv_params->round_1 - 1)));
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; j += 16) {
const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
// Load 16-bit data from the output of the horizontal filter in
// which the pixels are ordered as follows:
// [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
// Filter the even-indices, increasing to 32-bit precision
const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
const __m256i res_even = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
// Filter the odd-indices, increasing to 32-bit precision
const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
const __m256i res_odd = _mm256_add_epi32(
_mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
// Pixels are currently in the following order:
// res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
// res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
//
// Rearrange the pixels into the following order:
// res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
// res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
const __m256i res_lo_round = _mm256_srai_epi32(
_mm256_add_epi32(res_lo, round_const), conv_params->round_1);
const __m256i res_hi_round = _mm256_srai_epi32(
_mm256_add_epi32(res_hi, round_const), conv_params->round_1);
// Reduce to 16-bit precision and pack into the correct order:
// [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
const __m256i res_16bit =
_mm256_packs_epi32(res_lo_round, res_hi_round);
const __m256i res_16bit_clamped = _mm256_min_epi16(
_mm256_max_epi16(res_16bit, clamp_low), clamp_high);
// Store in the dst array
yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
}
}
}
}
// 256bit intrinsic implementation of ROUND_POWER_OF_TWO_SIGNED.
static INLINE __m256i round_power_of_two_signed_avx2(__m256i v_val_d,
int bits) {
const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
const __m256i v_sign_d = _mm256_srai_epi32(v_val_d, 31);
const __m256i v_tmp_d =
_mm256_add_epi32(_mm256_add_epi32(v_val_d, v_bias_d), v_sign_d);
return _mm256_srai_epi32(v_tmp_d, bits);
}
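// A scalar sketch of the per-lane computation above (illustrative):
//   int32_t round_power_of_two_signed(int32_t v, int bits) {
//     const int32_t bias = (1 << bits) >> 1;
//     const int32_t sign = v >> 31;      // 0 for v >= 0, -1 for v < 0
//     return (v + bias + sign) >> bits;  // arithmetic shift
//   }
// Folding the sign term into the sum makes negative values round symmetrically
// with positive ones, matching the scalar ROUND_POWER_OF_TWO_SIGNED macro.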
// Clip the pixel values to zero/max.
static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
}
// Cast a 128-bit register to 256 bits; the value occupies the lower 128 bits
// and the upper 128 bits are undefined until filled.
#define CAST_LOW(a) _mm256_castsi128_si256(a)
// Load source data required for chroma filtering into registers.
#define LOAD_SRC_DATA_FOR_CHROMA_FILTERING(src, stride) \
const __m128i src_a0 = _mm_loadu_si128((__m128i const *)src); \
const __m128i src_b0 = _mm_loadu_si128((__m128i const *)(src + stride)); \
const __m128i src_c0 = _mm_loadu_si128((__m128i const *)(src + 2 * stride)); \
const __m128i src_d0 = _mm_loadu_si128((__m128i const *)(src + 3 * stride)); \
const __m128i src_e0 = _mm_loadu_si128((__m128i const *)(src + 4 * stride)); \
const __m128i src_f0 = _mm_loadu_si128((__m128i const *)(src + 5 * stride)); \
const __m128i src_g0 = _mm_loadu_si128((__m128i const *)(src + 6 * stride)); \
const __m128i src_h0 = _mm_loadu_si128((__m128i const *)(src + 7 * stride));
// Form 256bit source registers from the loaded 128bit registers and pack them
// for the filtering process.
#define FORM_AND_PACK_REG_FOR_CHROMA_FILTERING() \
/* Form 256bit source registers.*/ \
/* a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7*/ \
const __m256i src_ab = _mm256_inserti128_si256(CAST_LOW(src_a0), src_b0, 1); \
/* c0 c1 c2 c3 c4 c5 c6 c7 | d0 d1 d2 d3 d4 d5 d6 d7*/ \
const __m256i src_cd = _mm256_inserti128_si256(CAST_LOW(src_c0), src_d0, 1); \
/* e0 e1 e2 e3 e4 e5 e6 e7 | f0 f1 f2 f3 f4 f5 f6 f7*/ \
const __m256i src_ef = _mm256_inserti128_si256(CAST_LOW(src_e0), src_f0, 1); \
/* g0 g1 g2 g3 g4 g5 g6 g7 | h0 h1 h2 h3 h4 h5 h6 h7*/ \
const __m256i src_gh = _mm256_inserti128_si256(CAST_LOW(src_g0), src_h0, 1); \
/* b0 b1 b2 b3 b4 b5 b6 b7 | c0 c1 c2 c3 c4 c5 c6 c7*/ \
const __m256i src_bc = _mm256_inserti128_si256(CAST_LOW(src_b0), src_c0, 1); \
/* d0 d1 d2 d3 d4 d5 d6 d7 | e0 e1 e2 e3 e4 e5 e6 e7*/ \
const __m256i src_de = _mm256_inserti128_si256(CAST_LOW(src_d0), src_e0, 1); \
/* f0 f1 f2 f3 f4 f5 f6 f7 | g0 g1 g2 g3 g4 g5 g6 g7*/ \
const __m256i src_fg = _mm256_inserti128_si256(CAST_LOW(src_f0), src_g0, 1); \
/* Packing the source rows.*/ \
/* a0 e0 a1 e1 a2 e2 a3 e3 | b0 f0 b1 f1 b2 f2 b3 f3*/ \
const __m256i ru0 = _mm256_unpacklo_epi16(src_ab, src_ef); \
/* a4 e4 a5 e5 a6 e6 a7 e7 | b4 f4 b5 f5 b6 f6 b7 f7*/ \
const __m256i ru1 = _mm256_unpackhi_epi16(src_ab, src_ef); \
/* c0 g0 c1 g1 c2 g2 c3 g3 | d0 h0 d1 h1 d2 h2 d3 h3*/ \
const __m256i ru2 = _mm256_unpacklo_epi16(src_cd, src_gh); \
/* c4 g4 c5 g5 c6 g6 c7 g7 | d4 h4 d5 h5 d6 h6 d7 h7*/ \
const __m256i ru3 = _mm256_unpackhi_epi16(src_cd, src_gh); \
/* b0 d0 b1 d1 b2 d2 b3 d3 | c0 e0 c1 e1 c2 e2 c3 e3*/ \
const __m256i ru4 = _mm256_unpacklo_epi16(src_bc, src_de); \
/* b4 d4 b5 d5 b6 d6 b7 d7 | c4 e4 c5 e5 c6 e6 c7 e7*/ \
const __m256i ru5 = _mm256_unpackhi_epi16(src_bc, src_de); \
/* d0 f0 d1 f1 d2 f2 d3 f3 | e0 g0 e1 g1 e2 g2 e3 g3*/ \
const __m256i ru6 = _mm256_unpacklo_epi16(src_de, src_fg); \
/* d4 f4 d5 f5 d6 f6 d7 f7 | e4 g4 e5 g5 e6 g6 e7 g7*/ \
const __m256i ru7 = _mm256_unpackhi_epi16(src_de, src_fg);
// Multiply the formed registers by the filter coefficients and add the results
// to the accumulator registers.
#define MADD_AND_ACCUM_FOR_CHROMA_FILTERING(f0, f1, f2) \
{ \
const __m256i res_0 = _mm256_madd_epi16(ru10, f0); \
const __m256i res_1 = _mm256_madd_epi16(ru11, f0); \
const __m256i res_2 = _mm256_madd_epi16(ru12, f1); \
const __m256i res_3 = _mm256_madd_epi16(ru13, f1); \
const __m256i res_4 = _mm256_madd_epi16(ru14, f2); \
const __m256i res_5 = _mm256_madd_epi16(ru15, f2); \
/* r00 r01 r02 r03 | r10 r11 r12 r13 */ \
const __m256i out_0 = _mm256_add_epi32(res_0, res_2); \
const __m256i out_1 = _mm256_add_epi32(res_4, out_0); \
*accum_out_r0r1 = _mm256_add_epi32(out_1, *accum_out_r0r1); \
/* r20 r21 r22 r23 | r30 r31 r32 r33 */ \
const __m256i out_2 = _mm256_add_epi32(res_1, res_3); \
const __m256i out_3 = _mm256_add_epi32(res_5, out_2); \
*accum_out_r2r3 = _mm256_add_epi32(out_3, *accum_out_r2r3); \
}
// Implementation of DIAMOND shaped 7-tap filtering for a block size of 4x4.
// The output for a particular pixel in a 4x4 block is calculated by considering
// a 5x5 grid centered on that pixel. The registers accum_out_r0r1 and
// accum_out_r2r3 are used to store the output. The following describes the
// algorithm briefly.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 fc6 x
// Load Source Data:
// src_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// src_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// src_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// src_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// src_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// src_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// src_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// The output for a pixel located at c2 position is calculated as below.
// Filtered_c2 = (a2+e2)*fc4 + (b1+d3)*fc2 + (b2+d2)*fc0 + (b3+d1)*fc3 +
// (c0+c4)*fc5 + (c1+c3)*fc1 + c2*fc6
// The source registers are unpacked such that the output corresponding to 2
// rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The output corresponding to fc4 of rows 0 and 1 is achieved like below.
// __m256i src_reg3 = a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
// __m256i filter_4 = fc4 fc4 fc4 fc4 fc4 fc4 fc4 fc4 | fc4 fc4 fc4 fc4 fc4 fc4
// fc4 fc4
// __m256i out_f4_01 = _mm256_madd_epi16(src_reg3, filter_4);
// = (a2*fc4+e2*fc4) (a3*fc4+e3*fc4) .. | (b2*fc4+f2*fc4) . .
// Here, out_f4_01 contains partial output of rows 0 and 1 corresponding to fc4.
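// A scalar sketch of the c2 output above (illustrative; the names follow the
// diagram and the rounding/clipping is done later in round_and_store_avx2()):
//   int32_t acc = 0;
//   acc += (a2 + e2) * fc4;
//   acc += (b1 + d3) * fc2;
//   acc += (b2 + d2) * fc0;
//   acc += (b3 + d1) * fc3;
//   acc += (c0 + c4) * fc5;
//   acc += (c1 + c3) * fc1;
//   acc += c2 * fc6;  // fc6 is the derived singleton_tap (see the caller)
//   // Later: dst = clamp(round(acc, prec_bits), 0, (1 << bit_depth) - 1).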
static INLINE void apply_7tap_filtering_with_subtract_center_off(
const uint16_t *dgd, int stride, const __m128i filt_coeff,
__m256i *accum_out_r0r1, __m256i *accum_out_r2r3, int block_row_begin,
int block_col_begin) {
// Load source data
const int src_index_start = block_row_begin * stride + block_col_begin;
const uint16_t *src_ptr = dgd + src_index_start - 2 * stride - 2;
// Load source data required for chroma filtering into registers.
LOAD_SRC_DATA_FOR_CHROMA_FILTERING(src_ptr, stride)
// Form 256bit source registers from the loaded 128bit registers and pack them
// for the filtering process.
FORM_AND_PACK_REG_FOR_CHROMA_FILTERING()
// Output corresponding to filter coefficient f4.
// a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
const __m256i ru8 = _mm256_alignr_epi8(ru1, ru0, 8);
// c2 g2 c3 g3 c4 g4 c5 g5 | d2 h2 d3 h3 d4 h4 d5 h5
const __m256i ru9 = _mm256_alignr_epi8(ru3, ru2, 8);
// f4 f4 f4 f4 f4 f4 f4 f4 | f4 f4 f4 f4 f4 f4 f4 f4
const __m256i fc4 =
_mm256_broadcastd_epi32(_mm_unpackhi_epi16(filt_coeff, filt_coeff));
// r00 r01 r02 r03 | r10 r11 r12 r13
const __m256i out_f4_r0r1 = _mm256_madd_epi16(ru8, fc4);
*accum_out_r0r1 = _mm256_add_epi32(out_f4_r0r1, *accum_out_r0r1);
// r20 r21 r22 r23 | r30 r31 r32 r33
const __m256i out_f4_r2r3 = _mm256_madd_epi16(ru9, fc4);
*accum_out_r2r3 = _mm256_add_epi32(out_f4_r2r3, *accum_out_r2r3);
// Output corresponding to filter coefficients 2, 0, 3.
// b1 d1 b2 d2 b3 d3 b4 d4 | c1 e1 c2 e2 c3 e3 c4 e4
__m256i ru10 = _mm256_alignr_epi8(ru5, ru4, 4);
// d1 f1 d2 f2 d3 f3 d4 f4 | e1 g1 e2 g2 e3 g3 e4 g4
__m256i ru11 = _mm256_alignr_epi8(ru7, ru6, 4);
// b2 d2 b3 d3 b4 d4 b5 d5 | c2 e2 c3 e3 c4 e4 c5 e5
__m256i ru12 = _mm256_alignr_epi8(ru5, ru4, 8);
// d2 f2 d3 f3 d4 f4 d5 f5 | e2 g2 e3 g3 e4 g4 e5 g5
__m256i ru13 = _mm256_alignr_epi8(ru7, ru6, 8);
// b3 d3 b4 d4 b5 d5 b6 d6 | c3 e3 c4 e4 c5 e5 c6 e6
__m256i ru14 = _mm256_alignr_epi8(ru5, ru4, 12);
// d3 f3 d4 f4 d5 f5 d6 f6 | e3 g3 e4 g4 e5 g5 e6 g6
__m256i ru15 = _mm256_alignr_epi8(ru7, ru6, 12);
// f2 f3 f2 f3 - - - -
const __m256i fc23 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff, 0x0E));
// f0 f0 f0 f0 - - - -
const __m256i fc00 = _mm256_broadcastw_epi16(filt_coeff);
// f3 f2 f3 f2 - - - -
const __m256i fc32 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff, 0x0B));
// Multiply the formed registers by the filter coefficients and add the results
// to the accumulator registers.
MADD_AND_ACCUM_FOR_CHROMA_FILTERING(fc23, fc00, fc32)
// Output corresponding to filter coefficients 5, 1, 6.
const __m256i zero = _mm256_set1_epi16(0x0);
// c0 c1 c1 c2 c2 c3 c3 c4 || d0 d1 d1 d2 d2 d3 d3 d4
ru10 = _mm256_unpacklo_epi16(src_cd, _mm256_bsrli_epi128(src_cd, 2));
// e0 e1 e1 e2 e2 e3 e3 e4 || f0 f1 f1 f2 f2 f3 f3 f4
ru11 = _mm256_unpacklo_epi16(src_ef, _mm256_bsrli_epi128(src_ef, 2));
// c2 c3 c3 c4 c4 c5 c5 c6 || d2 d3 d3 d4 d4 d5 d5 d6
ru12 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(src_cd, 4),
_mm256_bsrli_epi128(src_cd, 6));
// e2 e3 e3 e4 e4 e5 e5 e6 || f2 f3 f3 f4 f4 f5 f5 f6
ru13 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(src_ef, 4),
_mm256_bsrli_epi128(src_ef, 6));
// c4 0 c5 0 c6 0 c7 0 || d4 0 d5 0 d6 0 d7 0
ru14 = _mm256_unpackhi_epi16(src_cd, zero);
// e4 0 e5 0 e6 0 e7 0 || f4 0 f5 0 f6 0 f7 0
ru15 = _mm256_unpackhi_epi16(src_ef, zero);
// f5 f1 f5 f1 - - - -
const __m128i filt51 =
_mm_blend_epi16(filt_coeff, _mm_bsrli_si128(filt_coeff, 4), 0x08);
const __m256i fc51 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt51, 0x07));
// f6 f1 f6 f1 - - - -
const __m128i filt61 =
_mm_blend_epi16(filt_coeff, _mm_bsrli_si128(filt_coeff, 6), 0x08);
const __m256i fc61 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt61, 0x07));
// f5 0 f5 0 f5 0 f5 0 - -
const __m256i fc5z = _mm256_blend_epi16(fc51, zero, 0xAA);
// Multiply the formed registers by the filter coefficients and add the results
// to the accumulator registers.
MADD_AND_ACCUM_FOR_CHROMA_FILTERING(fc51, fc61, fc5z)
}
// The registers accum_out_r0r1 and accum_out_r2r3 hold the filtered output.
// This function performs rounding and clipping on the filtered output before
// storing it to the destination.
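// In scalar terms, each of the 16 output pixels below goes through (an
// illustrative sketch; 'acc' is the 32-bit accumulated filter sum):
//   const int v = ROUND_POWER_OF_TWO_SIGNED(acc, filter_config->prec_bits);
//   dst[row * dst_stride + col] = (uint16_t)clamp(v, 0, (1 << bit_depth) - 1);
// The packed 16-bit results are then written as four 64-bit stores, one
// 4-pixel output row each.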
static INLINE void round_and_store_avx2(const uint16_t *dst, int dst_stride,
const NonsepFilterConfig *filter_config,
int bit_depth, __m256i accum_out_r0r1,
__m256i accum_out_r2r3,
int block_row_begin,
int block_col_begin) {
// Rounding and clipping.
accum_out_r0r1 =
round_power_of_two_signed_avx2(accum_out_r0r1, filter_config->prec_bits);
accum_out_r2r3 =
round_power_of_two_signed_avx2(accum_out_r2r3, filter_config->prec_bits);
// r00 r01 r02 r03 | r20 r21 r22 r23 | r10 r11 r12 r13 | r30 r31 r32 r33
__m256i out_r0r2r1r3 = _mm256_packs_epi32(accum_out_r0r1, accum_out_r2r3);
const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
out_r0r2r1r3 = highbd_clamp_epi16(out_r0r2r1r3, _mm256_setzero_si256(), max);
// Store the output.
const int dst_id = block_row_begin * dst_stride + block_col_begin;
const __m128i out_r1r3 = _mm256_extractf128_si256(out_r0r2r1r3, 1);
_mm_storel_epi64((__m128i *)(dst + dst_id),
_mm256_castsi256_si128(out_r0r2r1r3));
_mm_storel_epi64((__m128i *)(dst + dst_id + (1 * dst_stride)), out_r1r3);
_mm_storel_epi64((__m128i *)(dst + dst_id + (2 * dst_stride)),
_mm_bsrli_si128(_mm256_castsi256_si128(out_r0r2r1r3), 8));
_mm_storel_epi64((__m128i *)(dst + dst_id + (3 * dst_stride)),
_mm_bsrli_si128(out_r1r3, 8));
}
// Implementation of DIAMOND shaped 7-tap (6 symmetric + 1) filtering for a
// block size of 4x4. The output for a particular pixel in a 4x4 block is
// calculated by considering a 5x5 grid centered on that pixel. The filter taps
// are expected to be passed as the symmetric taps followed by the center tap.
// The exact order of filter taps needed is shown below.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 fc
// Here, 'fc0-fc5' correspond to the symmetric taps and 'fc' is the center tap.
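// A worked note on the derived singleton_tap (no extra behavior, just the
// arithmetic): singleton_tap = (1 << prec_bits) + the stored center
// coefficient, so the center term becomes
//   c2 * singleton_tap = c2 * center_coeff + (c2 << prec_bits)
// and after the final rounding shift by prec_bits the (c2 << prec_bits) part
// adds the source pixel back, i.e. the filter acts as a correction on top of
// the input.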
static AOM_INLINE void av1_convolve_symmetric_highbd_7tap_avx2(
const uint16_t *dgd, int stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_col_begin) {
// Derive singleton_tap.
int32_t singleton_tap = 1 << filter_config->prec_bits;
if (filter_config->num_pixels % 2) {
const int singleton_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
singleton_tap += filter[singleton_tap_index];
}
// Load filter tap values.
// fc0 fc1 fc2 fc3 fc4 fc5 center_tap x
const __m128i filt_coeff_0 = _mm_loadu_si128((__m128i const *)(filter));
// Replace the center_tap with derived singleton_tap.
const __m128i center_tap = _mm_set1_epi16(singleton_tap);
const __m128i filt_coeff = _mm_blend_epi16(filt_coeff_0, center_tap, 0x40);
// Initializing the output registers with zero
__m256i accum_out_r0r1 = _mm256_setzero_si256();
__m256i accum_out_r2r3 = _mm256_setzero_si256();
// Perform 7-tap filtering on source buffer
apply_7tap_filtering_with_subtract_center_off(
dgd, stride, filt_coeff, &accum_out_r0r1, &accum_out_r2r3,
block_row_begin, block_col_begin);
// Store the output after rounding and clipping
round_and_store_avx2(dst, dst_stride, filter_config, bit_depth,
accum_out_r0r1, accum_out_r2r3, block_row_begin,
block_col_begin);
}
// AVX2 intrinsic to convolve a block of pixels with origin-symmetric,
// non-separable filters. The output for a particular pixel in a 4x4 block is
// calculated with a DIAMOND shaped filter considering a 7x7 grid centered on
// that pixel. The DIAMOND shape uses a 13-tap filter for convolution. The
// filter taps are expected to be passed as the symmetric taps followed by the
// center tap. The exact order of filter taps needed is shown below.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 fc9 fc10 fc11 fc
// Here, 'fc0-fc11' correspond to the symmetric taps and 'fc' is the center tap.
// The following describes the design considered for intrinsic implementation.
// Load Source Data:
// src_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// src_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// src_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// src_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// src_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// src_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// src_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// The output for a pixel located at d3 position is calculated as below.
// Filtered_d3 = (a3+g3)*fc0 + (b2+f4)*fc1 + (b3+f3)*fc2 + (b4+f5)*fc3 +
// (c1+e5)*fc4 + (c2+e4)*fc5 + (c3+e3)*fc6 + (c4+e2)*fc7 + (c5+e1)*fc8 +
// (d0+d6)*fc9 + (d1+d5)*fc10 + (d2+d4)*fc11 + d3*fc.
// The source registers are unpacked such that the output corresponding
// to 2 rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The output corresponding to fc0 of rows 0 and 1 is achieved like below.
// __m256i src_rag3 = a3 g3 a4 g4 a5 g5 a6 g6 | b3 h3 b4 h4 b5 h5 b6 h6
// __m256i filter_0 = f0 f0 f0 f0 f0 f0 f0 f0 | f0 f0 f0 f0 f0 f0 f0 f0
// __m256i out_f0_0 = _mm256_madd_epi16(src_rag3, filter_0);
// = (a3*f0+g3*f0) (a4*f0+g4*f0) .. | (b3*f0+h3*f0) . .
// Here, out_f0_0 contains partial output of rows 0 and 1 corresponding to fc0.
static AOM_INLINE void av1_convolve_symmetric_highbd_13tap_avx2(
const uint16_t *dgd, int stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_col_begin) {
// Derive singleton_tap.
int32_t singleton_tap = 1 << filter_config->prec_bits;
if (filter_config->num_pixels % 2) {
const int singleton_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
singleton_tap += filter[singleton_tap_index];
}
// Load source data.
int src_index_start = block_row_begin * stride + block_col_begin;
const uint16_t *src_ptr = dgd + src_index_start - 3 * stride - 3;
const __m128i src_a0 = _mm_loadu_si128((__m128i const *)src_ptr);
const __m128i src_b0 = _mm_loadu_si128((__m128i const *)(src_ptr + stride));
const __m128i src_c0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 2 * stride));
const __m128i src_d0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 3 * stride));
const __m128i src_e0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 4 * stride));
const __m128i src_f0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 5 * stride));
const __m128i src_g0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 6 * stride));
const __m128i src_h0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 7 * stride));
const __m128i src_i0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 8 * stride));
const __m128i src_j0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 9 * stride));
// Load filter tap values.
// fc0 fc1 fc2 fc3 fc4 fc5 fc6 fc7
const __m128i filt_coeff_0 = _mm_loadu_si128((__m128i const *)(filter));
// fc5 fc6 fc7 fc8 fc9 fc10 fc11 fc12
__m128i temp = _mm_loadu_si128((__m128i const *)(filter + 5));
// Replace the fc12 with derived singleton_tap.
const __m128i center_tap = _mm_set1_epi16(singleton_tap);
const __m128i filt_coeff_1 = _mm_blend_epi16(temp, center_tap, 0x80);
// Form 256bit source registers.
// a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
const __m256i src_ab =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_a0), src_b0, 1);
// c0 c1 c2 c3 c4 c5 c6 c7 | d0 d1 d2 d3 d4 d5 d6 d7
const __m256i src_cd =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_c0), src_d0, 1);
// e0 e1 e2 e3 e4 e5 e6 e7 | f0 f1 f2 f3 f4 f5 f6 f7
const __m256i src_ef =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_e0), src_f0, 1);
// g0 g1 g2 g3 g4 g5 g6 g7 | h0 h1 h2 h3 h4 h5 h6 h7
const __m256i src_gh =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_g0), src_h0, 1);
// i0 i1 i2 i3 i4 i5 i6 i7 | j0 j1 j2 j3 j4 j5 j6 j7
const __m256i src_ij =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_i0), src_j0, 1);
// Packing the source rows.
// a0 g0 a1 g1 a2 g2 a3 g3 | b0 h0 b1 h1 b2 h2 b3 h3
const __m256i ru0 = _mm256_unpacklo_epi16(src_ab, src_gh);
// a4 g4 a5 g5 a6 g6 a7 g7 | b4 h4 b5 h5 b6 h6 b7 h7
const __m256i ru1 = _mm256_unpackhi_epi16(src_ab, src_gh);
// c0 e0 c1 e1 c2 e2 c3 e3 | d0 f0 d1 f1 d2 f2 d3 f3
const __m256i ru2 = _mm256_unpacklo_epi16(src_cd, src_ef);
// c4 e4 c5 e5 c6 e6 c7 e7 | d4 f4 d5 f5 d6 f6 d7 f7
const __m256i ru3 = _mm256_unpackhi_epi16(src_cd, src_ef);
// c0 i0 c1 i1 c2 i2 c3 i3 | d0 j0 d1 j1 d2 j2 d3 j3
const __m256i ru4 = _mm256_unpacklo_epi16(src_cd, src_ij);
// c4 i4 c5 i5 c6 i6 c7 i7 | d4 j4 d5 j5 d6 j6 d7 j7
const __m256i ru5 = _mm256_unpackhi_epi16(src_cd, src_ij);
// e0 g0 e1 g1 e2 g2 e3 g3 | f0 h0 f1 h1 f2 h2 f3 h3
const __m256i ru6 = _mm256_unpacklo_epi16(src_ef, src_gh);
// e4 g4 e5 g5 e6 g6 e7 g7 | f4 h4 f5 h5 f6 h6 f7 h7
const __m256i ru7 = _mm256_unpackhi_epi16(src_ef, src_gh);
// Output corresponding to filter coefficient 0.
// a3 g3 a4 g4 a5 g5 a6 g6 | b3 h3 b4 h4 b5 h5 b6 h6
const __m256i ru8 = _mm256_alignr_epi8(ru1, ru0, 12);
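// How the byte alignment above works (an illustrative walk-through): for each
// 128-bit lane, _mm256_alignr_epi8(ru1, ru0, 12) concatenates the lane of ru0
// (low 16 bytes) with the lane of ru1 (high 16 bytes) and shifts the 32-byte
// pair right by 12 bytes, keeping the low 16 bytes. With
//   ru0 = a0 g0 a1 g1 a2 g2 a3 g3 | b0 h0 b1 h1 b2 h2 b3 h3
//   ru1 = a4 g4 a5 g5 a6 g6 a7 g7 | b4 h4 b5 h5 b6 h6 b7 h7
// this produces a3 g3 a4 g4 a5 g5 a6 g6 | b3 h3 b4 h4 b5 h5 b6 h6, i.e. the
// vertically paired samples needed for the (a3+g3)*fc0 term. The other alignr
// calls below follow the same pattern with different byte offsets.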
// c3 i3 c4 i4 c5 i5 c6 i6 | d3 j3 d4 j4 d5 j5 d6 j6
const __m256i ru9 = _mm256_alignr_epi8(ru5, ru4, 12);
// f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0
const __m256i fc0 = _mm256_broadcastw_epi16(filt_coeff_0);
// r00 r01 r02 r03 | r10 r11 r12 r13
__m256i accum_out_r0r1 = _mm256_madd_epi16(ru8, fc0);
// r20 r21 r22 r23 | r30 r31 r32 r33
__m256i accum_out_r2r3 = _mm256_madd_epi16(ru9, fc0);
// Output corresponding to filter coefficients 4,5,6,7.
// c1 e1 c2 e2 c3 e3 c4 e4 | d1 f1 d2 f2 d3 f3 d4 f4
const __m256i ru10 = _mm256_alignr_epi8(ru3, ru2, 4);
// e1 g1 e2 g2 e3 g3 e4 g4 | f1 h1 f2 h2 f3 h3 f4 h4
const __m256i ru11 = _mm256_alignr_epi8(ru7, ru6, 4);
// c2 e2 c3 e3 c4 e4 c5 e5 | d2 f2 d3 f3 d4 f4 d5 f5
const __m256i ru12 = _mm256_alignr_epi8(ru3, ru2, 8);
// e2 g2 e3 g3 e4 g4 e5 g5 | f2 h2 f3 h3 f4 h4 f5 h5
const __m256i ru13 = _mm256_alignr_epi8(ru7, ru6, 8);
// c3 e3 c4 e4 c5 e5 c6 e6 | d3 f3 d4 f4 d5 f5 d6 f6
const __m256i ru14 = _mm256_alignr_epi8(ru3, ru2, 12);
// e3 g3 e4 g4 e5 g5 e6 g6 | f3 h3 f4 h4 f5 h5 f6 h6
const __m256i ru15 = _mm256_alignr_epi8(ru7, ru6, 12);
// 0 fc5 fc6 fc7 fc8 fc9 fc10 fc11
temp = _mm_bslli_si128(filt_coeff_1, 2);
// fc4 fc8 fc5 fc9 fc6 fc10 fc7 fc11
temp = _mm_unpackhi_epi16(filt_coeff_0, temp);
// fc4 fc8 fc4 fc8 fc4 fc8 fc4 fc8 | fc4 fc8 fc4 fc8 fc4 fc8 fc4 fc8
const __m256i filt48 = _mm256_broadcastd_epi32(temp);
// fc5 fc7 fc5 fc7 fc5 fc7 fc5 fc7 | fc5 fc7 fc5 fc7 fc5 fc7 fc5 fc7
const __m256i filt57 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff_1, 0x08));
// fc6 fc6 fc6 fc6 fc6 fc6 fc6 fc6 | fc6 fc6 fc6 fc6 fc6 fc6 fc6 fc6
const __m256i filt66 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff_1, 0x55));
// fc7 fc5 fc7 fc5 fc7 fc5 fc7 fc5 | fc7 fc5 fc7 fc5 fc7 fc5 fc7 fc5
const __m256i filt75 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff_1, 0x22));
const __m256i res_0 = _mm256_madd_epi16(ru10, filt48);
const __m256i res_1 = _mm256_madd_epi16(ru11, filt48);
const __m256i res_2 = _mm256_madd_epi16(ru12, filt57);
const __m256i res_3 = _mm256_madd_epi16(ru13, filt57);
const __m256i res_4 = _mm256_madd_epi16(ru14, filt66);
const __m256i res_5 = _mm256_madd_epi16(ru15, filt66);
const __m256i res_6 = _mm256_madd_epi16(ru3, filt75);
const __m256i res_7 = _mm256_madd_epi16(ru7, filt75);
// r00 r01 r02 r03 | r10 r11 r12 r13
const __m256i out_0 = _mm256_add_epi32(res_0, res_2);
const __m256i out_1 = _mm256_add_epi32(res_4, res_6);
const __m256i out_2 = _mm256_add_epi32(out_0, out_1);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, out_2);
// r20 r21 r22 r23 | r30 r31 r32 r33
const __m256i out_3 = _mm256_add_epi32(res_1, res_3);
const __m256i out_4 = _mm256_add_epi32(res_5, res_7);
const __m256i out_5 = _mm256_add_epi32(out_3, out_4);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, out_5);
// Output corresponding to filter coefficients 9,10,11,12.
// d2 d3 d4 d5 d6 d7 d8 d9
const __m128i src_d2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 3 * stride + 2));
// e2 e3 e4 e5 e6 e7 e8 e9
const __m128i src_e2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 4 * stride + 2));
// f2 f3 f4 f5 f6 f7 f8 f9
const __m128i src_f2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 5 * stride + 2));
// g2 g3 g4 g5 g6 g7 g8 g9
const __m128i src_g2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 6 * stride + 2));
// d0 d1 d2 d3 d4 d5 d6 d7 | e0 e1 e2 e3 e4 e5 e6 e7
const __m256i rm0 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_d0), src_e0, 1);
// d2 d3 d4 d5 d6 d7 d8 d9 | e2 e3 e4 e5 e6 e7 e8 e9
const __m256i rm2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_d2), src_e2, 1);
// f0 f1 f2 f3 f4 f5 f6 f7 | g0 g1 g2 g3 g4 g5 g6 g7
const __m256i rm00 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_f0), src_g0, 1);
// f2 f3 f4 f5 f6 f7 f8 f9 | g2 g3 g4 g5 g6 g7 g8 g9
const __m256i rm22 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_f2), src_g2, 1);
// d1 d2 d3 d4 d5 d6 d7 d8 | e1 e2 e3 e4 e5 e6 e7 e8
const __m256i rm1 = _mm256_alignr_epi8(_mm256_bsrli_epi128(rm2, 12), rm0, 2);
const __m256i rm11 =
_mm256_alignr_epi8(_mm256_bsrli_epi128(rm22, 12), rm00, 2);
// d3 d4 d5 d6 d7 d8 d9 0 | e3 e4 e5 e6 e7 e8 e9 0
const __m256i rm3 = _mm256_bsrli_epi128(rm2, 2);
const __m256i rm33 = _mm256_bsrli_epi128(rm22, 2);
// d0 d1 d1 d2 d2 d3 d3 d4 | e0 e1 e1 e2 e2 e3 e3 e4
__m256i rm4 = _mm256_unpacklo_epi16(rm0, rm1);
// d2 d3 d3 d4 d4 d5 d5 d6 | e2 e3 e3 e4 e4 e5 e5 e6
__m256i rm5 = _mm256_unpacklo_epi16(rm2, rm3);
// d4 d5 d5 d6 d6 d7 d7 d8 | e4 e5 e5 e6 e6 e7 e7 e8
__m256i rm6 = _mm256_unpackhi_epi16(rm0, rm1);
// d6 0 d7 0 d8 0 d9 0 | e6 0 e7 0 e8 0 e9 0
__m256i rm7 = _mm256_unpackhi_epi16(rm2, _mm256_set1_epi16(0));
__m256i rm44 = _mm256_unpacklo_epi16(rm00, rm11);
__m256i rm55 = _mm256_unpacklo_epi16(rm22, rm33);
__m256i rm66 = _mm256_unpackhi_epi16(rm00, rm11);
__m256i rm77 = _mm256_unpackhi_epi16(rm22, _mm256_set1_epi16(0));
// fc9 fc10 - - - - - - | fc9 fc10 - - - - - -
const __m256i fc910 =
_mm256_broadcastd_epi32(_mm_bsrli_si128(filt_coeff_1, 8));
// fc11 fc12 - - - - - - | fc11 fc12 - - - - - -
const __m256i fc1112 =
_mm256_broadcastd_epi32(_mm_bsrli_si128(filt_coeff_1, 12));
// fc11 fc10 - - - - - - | fc11 fc10 - - - - - -
const __m256i fc1110 = _mm256_broadcastd_epi32(
_mm_bsrli_si128(_mm_shufflehi_epi16(filt_coeff_1, 0x06), 8));
rm4 = _mm256_madd_epi16(rm4, fc910);
rm5 = _mm256_madd_epi16(rm5, fc1112);
rm6 = _mm256_madd_epi16(rm6, fc1110);
rm7 = _mm256_madd_epi16(rm7, fc910);
rm44 = _mm256_madd_epi16(rm44, fc910);
rm55 = _mm256_madd_epi16(rm55, fc1112);
rm66 = _mm256_madd_epi16(rm66, fc1110);
rm77 = _mm256_madd_epi16(rm77, fc910);
// r00 r01 r02 r03 | r10 r11 r12 r13
rm4 = _mm256_add_epi32(rm4, rm5);
rm6 = _mm256_add_epi32(rm6, rm7);
rm4 = _mm256_add_epi32(rm4, rm6);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, rm4);
// r20 r21 r22 r23 | r30 r31 r32 r33
rm44 = _mm256_add_epi32(rm44, rm55);
rm66 = _mm256_add_epi32(rm66, rm77);
rm44 = _mm256_add_epi32(rm44, rm66);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, rm44);
// Output corresponding to filter coefficients 1,2,3,8.
const __m128i src_b2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 1 * stride + 2));
const __m128i src_c2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 2 * stride + 2));
const __m256i rn0 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_b2), src_c2, 1);
// b2 b3 b3 b4 b4 b5 b5 b6 | c2 c3 c3 c4 c4 c5 c5 c6
__m256i r0 = _mm256_unpacklo_epi16(rn0, _mm256_bsrli_epi128(rn0, 2));
const __m256i rcd2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_c2), src_d2, 1);
// b4 c5 b5 c6 b6 c7 b7 c8 | c4 d5 c5 d6 c6 d7 c7 d8
__m256i r1 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rn0, 4),
_mm256_bsrli_epi128(rcd2, 6));
const __m256i rfg2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_f2), src_g2, 1);
// f2 f3 f3 f4 f4 f5 f5 f6 | g2 g3 g3 g4 g4 g5 g5 g6
__m256i r2 = _mm256_unpacklo_epi16(rfg2, _mm256_bsrli_epi128(rfg2, 2));
const __m256i ref2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_e2), src_f2, 1);
// f4 e5 f5 e6 f6 e7 f7 e8 | g4 f5 g5 f6 g6 f7 g7 f8
__m256i r3 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rfg2, 4),
_mm256_bsrli_epi128(ref2, 6));
const __m128i tempn =
_mm_blend_epi16(_mm_bsrli_si128(filt_coeff_0, 2), filt_coeff_1, 0x08);
__m128i tempn2 = _mm_bsrli_si128(filt_coeff_0, 2);
tempn2 = _mm_shufflelo_epi16(tempn2, 0x0c);
// fc1 fc2 - - - - - - | fc1 fc2 - - - - - -
const __m256i fc12 = _mm256_broadcastd_epi32(tempn);
// fc1 fc4 - - - - - - | fc1 fc4 - - - - - -
const __m256i fc14 = _mm256_broadcastd_epi32(tempn2);
tempn2 = _mm_shufflelo_epi16(tempn, 0x06);
// fc3 fc8 - - - - - - | fc3 fc8 - - - - - -
const __m256i fc38 = _mm256_broadcastd_epi32(_mm_bsrli_si128(tempn, 4));
// fc3 fc2 - - - - - - | fc3 fc2 - - - - - -
const __m256i fc32 = _mm256_broadcastd_epi32(tempn2);
r0 = _mm256_madd_epi16(r0, fc12);
r1 = _mm256_madd_epi16(r1, fc38);
r2 = _mm256_madd_epi16(r2, fc32);
r3 = _mm256_madd_epi16(r3, fc14);
// r00 r01 r02 r03 | r10 r11 r12 r13
r0 = _mm256_add_epi32(r0, r1);
r2 = _mm256_add_epi32(r2, r3);
r0 = _mm256_add_epi32(r0, r2);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, r0);
const __m256i rn1 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_d2), src_e2, 1);
// d2 d3 d3 d4 d4 d5 d5 d6 | e2 e3 e3 e4 e4 e5 e5 e6
__m256i r00 = _mm256_unpacklo_epi16(rn1, _mm256_bsrli_epi128(rn1, 2));
// d4 e5 d5 e6 d6 e7 d7 e8 | e4 f5 e5 f6 e6 f7 e7 f8
__m256i r11 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rn1, 4),
_mm256_bsrli_epi128(ref2, 6));
const __m128i src_h2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 7 * stride + 2));
const __m128i src_i2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 8 * stride + 2));
// h2 h3 h4 h5 h6 h7 h8 h9 | i2 i3 i4 i5 i6 i7 i8 i9
__m256i rhi2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_h2), src_i2, 1);
// h2 h3 h3 h4 h4 h5 h5 h6 | i2 i3 i3 i4 i4 i5 i5 i6
__m256i r22 = _mm256_unpacklo_epi16(rhi2, _mm256_bsrli_epi128(rhi2, 2));
// g2 g3 g4 g5 g6 g7 g8 g9 | h2 h3 h4 h5 h6 h7 h8 h9
__m256i rgh2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_g2), src_h2, 1);
__m256i r33 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rhi2, 4),
_mm256_bsrli_epi128(rgh2, 6));
r00 = _mm256_madd_epi16(r00, fc12);
r11 = _mm256_madd_epi16(r11, fc38);
r22 = _mm256_madd_epi16(r22, fc32);
r33 = _mm256_madd_epi16(r33, fc14);
// r20 r21 r22 r23 | r30 r31 r32 r33
r00 = _mm256_add_epi32(r00, r11);
r22 = _mm256_add_epi32(r22, r33);
r00 = _mm256_add_epi32(r00, r22);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, r00);
// Rounding and clipping.
accum_out_r0r1 =
round_power_of_two_signed_avx2(accum_out_r0r1, filter_config->prec_bits);
accum_out_r2r3 =
round_power_of_two_signed_avx2(accum_out_r2r3, filter_config->prec_bits);
// r00 r01 r02 r03 | r20 r21 r22 r23 | r10 r11 r12 r13 | r30 r31 r32 r33
__m256i out_r0r2r1r3 = _mm256_packs_epi32(accum_out_r0r1, accum_out_r2r3);
const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
out_r0r2r1r3 = highbd_clamp_epi16(out_r0r2r1r3, _mm256_setzero_si256(), max);
// Store the output.
const int dst_id = block_row_begin * dst_stride + block_col_begin;
const __m128i out_r1r3 = _mm256_extractf128_si256(out_r0r2r1r3, 1);
_mm_storel_epi64((__m128i *)(dst + dst_id),
_mm256_castsi256_si128(out_r0r2r1r3));
_mm_storel_epi64((__m128i *)(dst + dst_id + (1 * dst_stride)), out_r1r3);
_mm_storel_epi64((__m128i *)(dst + dst_id + (2 * dst_stride)),
_mm_bsrli_si128(_mm256_castsi256_si128(out_r0r2r1r3), 8));
_mm_storel_epi64((__m128i *)(dst + dst_id + (3 * dst_stride)),
_mm_bsrli_si128(out_r1r3, 8));
}
// SIMD implementation to convolve a block of pixels with an origin-symmetric,
// pc-wiener filter corresponding to CONFIG_PC_WIENER loop restoration. A
// DIAMOND shape with a 13-tap (12 symmetric + 1) or 7-tap (6 symmetric + 1)
// filter is used for convolution.
void av1_convolve_symmetric_highbd_avx2(const uint16_t *dgd, int stride,
const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst,
int dst_stride, int bit_depth,
int block_row_begin, int block_row_end,
int block_col_begin,
int block_col_end) {
assert(!filter_config->subtract_center);
const int num_rows = block_row_end - block_row_begin;
const int num_cols = block_col_end - block_col_begin;
const int num_sym_taps = filter_config->num_pixels / 2;
// SIMD is implemented only for the diamond shape filter with 13 taps (12
// symmetric + 1) or 7 taps (6 symmetric + 1) and a block size of 4x4. For any
// other case, invoke the C function.
if (num_rows != 4 || num_cols != 4 ||
(num_sym_taps != 12 && num_sym_taps != 6)) {
av1_convolve_symmetric_highbd_c(
dgd, stride, filter_config, filter, dst, dst_stride, bit_depth,
block_row_begin, block_row_end, block_col_begin, block_col_end);
return;
}
// 7-tap implementation (6 symmetric + 1) for convolve symmetric with subtract
// center off.
if (num_sym_taps == 6) {
av1_convolve_symmetric_highbd_7tap_avx2(dgd, stride, filter_config, filter,
dst, dst_stride, bit_depth,
block_row_begin, block_col_begin);
return;
}
// 13-tap implementation (12 symmetric + 1) for convolve symmetric with
// subtract center off.
if (num_sym_taps == 12) {
av1_convolve_symmetric_highbd_13tap_avx2(dgd, stride, filter_config, filter,
dst, dst_stride, bit_depth,
block_row_begin, block_col_begin);
return;
}
}
// AVX2 intrinsic for convolve wiener non-separable dual loop restoration
// filtering with subtract center off. Two buffers (dgd and dgd_dual) are
// considered for dual filtering, where each buffer goes through 7-tap
// filtering. The output for a particular pixel in a 4x4 block is calculated
// with a DIAMOND shaped filter considering a 5x5 grid centered on that pixel.
// A total of 14 filter taps are required and are expected to be passed as the
// filter taps for the dgd (first) buffer followed by the filter taps for the
// dgd_dual (second) buffer. The exact order of filter taps needed is shown
// below.
// Filter Taps: fc0 fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 fc9 fc10 fc11 fc12 fc13.
// Here, 'fc0-fc6' and 'fc7-fc13' correspond to the filter taps used for the
// dgd and dgd_dual buffers respectively. Also, 'fc0-fc5' and 'fc7-fc12' are the
// symmetric taps, while 'fc6' and 'fc13' are the center taps for the dgd and
// dgd_dual buffers respectively.
//
// The following describes the algorithm briefly.
// 7-tap filtering for dgd (first) buffer:
// Load Source Data:
// dgd_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// dgd_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// dgd_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// dgd_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// dgd_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// dgd_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// dgd_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// The output for a pixel located at c2 position is calculated as below.
// dgd_output_c2 = (a2+e2)*fc4 + (b1+d3)*fc2 + (b2+d2)*fc0 + (b3+d1)*fc3 +
// (c0+c4)*fc5 + (c1+c3)*fc1 + c2*fc6 (singleton_tap)
//
// 7-tap filtering for dgd_dual (second) buffer:
// Load Source Data:
// dgd_dual_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// dgd_dual_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// dgd_dual_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// dgd_dual_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// dgd_dual_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// dgd_dual_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// dgd_dual_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// dgd_dual_output_c2 = (a2+e2)*fc11 + (b1+d3)*fc9 + (b2+d2)*fc7 + (b3+d1)*fc10
// + (c0+c4)*fc12 + (c1+c3)*fc8 + c2*fc13 (singleton_tap_dual)
// The final output corresponding to c2 is calculated as below.
// output_c2 = dgd_output_c2 + dgd_dual_output_c2
// The source registers are unpacked such that the output corresponding to 2
// rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The dgd(first) buffer output corresponding to fc4 of rows 0 and 1 is achieved
// like below.
// __m256i src_reg3 = a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
// __m256i filter_4 = fc4 fc4 fc4 fc4 fc4 fc4 fc4 fc4 | fc4 fc4 fc4 fc4 fc4
// fc4 fc4 fc4
// __m256i out_f4_01 = _mm256_madd_epi16(src_reg3, filter_4);
// = (a2*fc4+e2*fc4) (a3*fc4+e3*fc4) .. | (b2*fc4+f2*fc4)
// . .
// Here, out_f4_01 contains partial output of rows 0 and 1 corresponding to
// fc4.
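// In scalar terms the code below simply shares one pair of 32-bit accumulator
// registers between the two buffers (an illustrative sketch; diamond7() is a
// hypothetical name standing for the per-pixel 7-tap sum described above):
//   acc  = diamond7(dgd, fc0..fc5, singleton_tap);
//   acc += diamond7(dgd_dual, fc7..fc12, singleton_tap_dual);
//   dst  = clamp(ROUND_POWER_OF_TWO_SIGNED(acc, prec_bits), 0,
//                (1 << bit_depth) - 1);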
void av1_convolve_symmetric_dual_highbd_avx2(
const uint16_t *dgd, int dgd_stride, const uint16_t *dgd_dual,
int dgd_dual_stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_row_end, int block_col_begin,
int block_col_end) {
assert(!filter_config->subtract_center);
const int num_rows = block_row_end - block_row_begin;
const int num_cols = block_col_end - block_col_begin;
assert(num_rows >= 0 && num_cols >= 0);
const int num_sym_taps = filter_config->num_pixels / 2;
const int num_sym_taps_dual = filter_config->num_pixels2 / 2;
// SIMD is implemented only for the diamond shape filter using 7-tap filtering
// for each of the first (dgd) and second (dgd_dual) buffers and a 4x4 block
// size. For any other case, invoke the C function.
if (num_rows != 4 || num_cols != 4 || num_sym_taps != 6 ||
num_sym_taps_dual != 6) {
av1_convolve_symmetric_dual_highbd_c(
dgd, dgd_stride, dgd_dual, dgd_dual_stride, filter_config, filter, dst,
dst_stride, bit_depth, block_row_begin, block_row_end, block_col_begin,
block_col_end);
return;
}
// Derive singleton_tap and singleton_tap_dual and replace the center tap
// filter values with them.
int32_t singleton_tap = 1 << filter_config->prec_bits;
int32_t singleton_tap_dual = 0;
if (filter_config->num_pixels % 2) {
const int singleton_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
singleton_tap += filter[singleton_tap_index];
}
if (filter_config->num_pixels2 % 2) {
const int last_config = filter_config->num_pixels2 - 1;
const int singleton_tap_index =
filter_config->config2[last_config][NONSEP_BUF_POS];
singleton_tap_dual += filter[singleton_tap_index];
}
// Prepare filter coefficients for dgd(first) buffer 7-tap filtering
// fc0 fc1 fc2 fc3 fc4 fc5 fc6(center_tap) x
__m128i filter_coeff = _mm_loadu_si128((__m128i const *)(filter));
// Replace the center_tap with derived singleton_tap.
const __m128i center_tap1 = _mm_set1_epi16(singleton_tap);
const __m128i filter_coeff_dgd =
_mm_blend_epi16(filter_coeff, center_tap1, 0x40);
// Prepare filter coefficients for dgd_dual(second) buffer 7-tap filtering
// fc7 fc8 fc9 fc10 fc11 fc12 fc13(center_tap) x
filter_coeff = _mm_loadu_si128((__m128i const *)(filter + 6));
filter_coeff = _mm_bsrli_si128(filter_coeff, 2);
// Replace the center_tap with derived singleton_tap_dual.
const __m128i center_tap2 = _mm_set1_epi16(singleton_tap_dual);
const __m128i filter_coeff_dgd_dual =
_mm_blend_epi16(filter_coeff, center_tap2, 0x40);
// Initialize the output registers with zero.
__m256i accum_out_r0r1 = _mm256_setzero_si256();
__m256i accum_out_r2r3 = _mm256_setzero_si256();
// 7-tap filtering for dgd (first) buffer.
apply_7tap_filtering_with_subtract_center_off(
dgd, dgd_stride, filter_coeff_dgd, &accum_out_r0r1, &accum_out_r2r3,
block_row_begin, block_col_begin);
// 7-tap filtering for dgd_dual (second) buffer.
apply_7tap_filtering_with_subtract_center_off(
dgd_dual, dgd_dual_stride, filter_coeff_dgd_dual, &accum_out_r0r1,
&accum_out_r2r3, block_row_begin, block_col_begin);
// Store the output after rounding and clipping.
round_and_store_avx2(dst, dst_stride, filter_config, bit_depth,
accum_out_r0r1, accum_out_r2r3, block_row_begin,
block_col_begin);
}
// TODO(Arun Negi): Difference of source and center pixel needs to go through
// clip_base(). Implement clip_base() in intrinsic once the support is added.
//
// Implementation of DIAMOND shaped 6-tap filtering for a block size of 4x4.
// The output for a particular pixel in a 4x4 block is calculated by considering
// a 5x5 grid centered on that pixel. The registers accum_out_r0r1 and
// accum_out_r2r3 are used to store the output. The following describes the
// algorithm briefly.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 x x
// Load Source Data:
// src_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// src_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// src_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// src_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// src_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// src_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// src_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// The output for a pixel located at c2 position is calculated as below.
// Filtered_c2 = ((a2-c2)+(e2-c2))*fc4 + ((b1-c2+d3-c2))*fc2 +
// (b2-c2+d2-c2)*fc0 + (b3-c2+d1-c2)*fc3 + (c0-c2+c4-c2)*fc5 +
// (c1-c2+c3-c2)*fc1 + c2*singleton_tap + dc_offset
// The source registers are unpacked such that the output corresponding to 2
// rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The output corresponding to fc4 of rows 0 and 1 is achieved like below.
// __m256i centerpixel_row01 = c2 c2 c3 c3 c4 c4 c5 c5 | d2 d2 d3 d3 d4 d4 d5 d5
// __m256i src_reg3 = a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
// __m256i filter_4 = fc4 fc4 fc4 fc4 fc4 fc4 fc4 fc4 | fc4 fc4 fc4 fc4 fc4 fc4
// fc4 fc4
// src_reg3 = _mm256_sub_epi16(src_reg3, centerpixel_row01);
// __m256i out_f4_01 = _mm256_madd_epi16(src_reg3, filter_4);
// = ((a2-c2)*fc4+(e2-c2)*fc4) ((a3-c3)*fc4+(e3-c3)*fc4) .. |
// ((b2-d2)*fc4+(f2-d2)*fc4) . .
// Here, out_f4_01 contains partial output of rows 0 and 1 corresponding to fc4.
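// A scalar sketch of the c2 output above with subtract-center (illustrative;
// the names follow the diagram and the rounding/clipping is done later in
// round_and_store_avx2()):
//   int32_t acc = 0;
//   acc += ((a2 - c2) + (e2 - c2)) * fc4;
//   acc += ((b1 - c2) + (d3 - c2)) * fc2;
//   acc += ((b2 - c2) + (d2 - c2)) * fc0;
//   acc += ((b3 - c2) + (d1 - c2)) * fc3;
//   acc += ((c0 - c2) + (c4 - c2)) * fc5;
//   acc += ((c1 - c2) + (c3 - c2)) * fc1;
//   acc += c2 * singleton_tap;  // singleton_tap == 1 << prec_bits here
//   acc += dc_offset;           // added by the caller before rounding
//   // dst = clamp(round(acc, prec_bits), 0, (1 << bit_depth) - 1).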
static INLINE void apply_6tap_filtering(const uint16_t *dgd, int stride,
const __m128i filt_coeff,
__m256i *accum_out_r0r1,
__m256i *accum_out_r2r3,
int block_row_begin,
int block_col_begin) {
// Load source data
const int src_index_start = block_row_begin * stride + block_col_begin;
const uint16_t *src_ptr = dgd + src_index_start - 2 * stride - 2;
// Load source data required for chroma filtering into registers.
LOAD_SRC_DATA_FOR_CHROMA_FILTERING(src_ptr, stride)
// Form 256bit source registers from the loaded 128bit registers and pack them
// for the filtering process.
FORM_AND_PACK_REG_FOR_CHROMA_FILTERING()
// Derive registers to hold center pixel
// c2 c2 c3 c3 c4 c4 c5 c5 | d2 d2 d3 d3 d4 d4 d5 d5
const __m256i cp0 = _mm256_bslli_epi128(src_cd, 4);
const __m256i center_pixel_row01_0 = _mm256_unpackhi_epi16(cp0, cp0);
// e2 e2 e3 e3 e4 e4 e5 e5 | f2 f2 f3 f3 f4 f4 f5 f5
const __m256i cp1 = _mm256_bslli_epi128(src_ef, 4);
const __m256i center_pixel_row23_0 = _mm256_unpackhi_epi16(cp1, cp1);
const __m256i zero = _mm256_set1_epi16(0x0);
// 0 c2 0 c3 0 c4 0 c5 | 0 d2 0 d3 0 d4 0 d5
const __m256i center_pixel_row01_1 = _mm256_unpackhi_epi16(zero, cp0);
// 0 e2 0 e3 0 e4 0 e5 | 0 f2 0 f3 0 f4 0 f5
const __m256i center_pixel_row23_1 = _mm256_unpackhi_epi16(zero, cp1);
// Output corresponding to filter coefficient f4.
// a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
const __m256i ru8_0 = _mm256_alignr_epi8(ru1, ru0, 8);
const __m256i ru8 = _mm256_sub_epi16(ru8_0, center_pixel_row01_0);
// c2 g2 c3 g3 c4 g4 c5 g5 | d2 h2 d3 h3 d4 h4 d5 h5
const __m256i ru9_0 = _mm256_alignr_epi8(ru3, ru2, 8);
const __m256i ru9 = _mm256_sub_epi16(ru9_0, center_pixel_row23_0);
// f4 f4 f4 f4 f4 f4 f4 f4 | f4 f4 f4 f4 f4 f4 f4 f4
const __m256i fc4 =
_mm256_broadcastd_epi32(_mm_unpackhi_epi16(filt_coeff, filt_coeff));
// r00 r01 r02 r03 | r10 r11 r12 r13
const __m256i out_f4_r0r1 = _mm256_madd_epi16(ru8, fc4);
*accum_out_r0r1 = _mm256_add_epi32(out_f4_r0r1, *accum_out_r0r1);
// r20 r21 r22 r23 | r30 r31 r32 r33
const __m256i out_f4_r2r3 = _mm256_madd_epi16(ru9, fc4);
*accum_out_r2r3 = _mm256_add_epi32(out_f4_r2r3, *accum_out_r2r3);
// Output corresponding to filter coefficients 2, 0, 3.
// b1 d1 b2 d2 b3 d3 b4 d4 | c1 e1 c2 e2 c3 e3 c4 e4
const __m256i ru10_0 = _mm256_alignr_epi8(ru5, ru4, 4);
__m256i ru10 = _mm256_sub_epi16(ru10_0, center_pixel_row01_0);
// d1 f1 d2 f2 d3 f3 d4 f4 | e1 g1 e2 g2 e3 g3 e4 g4
const __m256i ru11_0 = _mm256_alignr_epi8(ru7, ru6, 4);
__m256i ru11 = _mm256_sub_epi16(ru11_0, center_pixel_row23_0);
// b2 d2 b3 d3 b4 d4 b5 d5 | c2 e2 c3 e3 c4 e4 c5 e5
const __m256i ru12_0 = _mm256_alignr_epi8(ru5, ru4, 8);
__m256i ru12 = _mm256_sub_epi16(ru12_0, center_pixel_row01_0);
// d2 f2 d3 f3 d4 f4 d5 f5 | e2 g2 e3 g3 e4 g4 e5 g5
const __m256i ru13_0 = _mm256_alignr_epi8(ru7, ru6, 8);
__m256i ru13 = _mm256_sub_epi16(ru13_0, center_pixel_row23_0);
// b3 d3 b4 d4 b5 d5 b6 d6 | c3 e3 c4 e4 c5 e5 c6 e6
const __m256i ru14_0 = _mm256_alignr_epi8(ru5, ru4, 12);
__m256i ru14 = _mm256_sub_epi16(ru14_0, center_pixel_row01_0);
// d3 f3 d4 f4 d5 f5 d6 f6 | e3 g3 e4 g4 e5 g5 e6 g6
const __m256i ru15_0 = _mm256_alignr_epi8(ru7, ru6, 12);
__m256i ru15 = _mm256_sub_epi16(ru15_0, center_pixel_row23_0);
// f2 f3 f2 f3 - - - -
const __m256i fc23 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff, 0x0E));
// f0 f0 f0 f0 - - - -
const __m256i fc00 = _mm256_broadcastw_epi16(filt_coeff);
// f3 f2 f3 f2 - - - -
const __m256i fc32 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff, 0x0B));
// Multiply the formed registers by the filter coefficients and add the results
// to the accumulator registers.
MADD_AND_ACCUM_FOR_CHROMA_FILTERING(fc23, fc00, fc32)
// Output corresponding to filter coefficients 5, 1, 6.
// c0 c1 c1 c2 c2 c3 c3 c4 || d0 d1 d1 d2 d2 d3 d3 d4
ru10 = _mm256_unpacklo_epi16(src_cd, _mm256_bsrli_epi128(src_cd, 2));
ru10 = _mm256_sub_epi16(ru10, center_pixel_row01_0);
// e0 e1 e1 e2 e2 e3 e3 e4 || f0 f1 f1 f2 f2 f3 f3 f4
ru11 = _mm256_unpacklo_epi16(src_ef, _mm256_bsrli_epi128(src_ef, 2));
ru11 = _mm256_sub_epi16(ru11, center_pixel_row23_0);
// c2 c3 c3 c4 c4 c5 c5 c6 || d2 d3 d3 d4 d4 d5 d5 d6
ru12 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(src_cd, 4),
_mm256_bsrli_epi128(src_cd, 6));
ru12 = _mm256_sub_epi16(ru12, center_pixel_row01_1);
// e2 e3 e3 e4 e4 e5 e5 e6 || f2 f3 f3 f4 f4 f5 f5 f6
ru13 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(src_ef, 4),
_mm256_bsrli_epi128(src_ef, 6));
ru13 = _mm256_sub_epi16(ru13, center_pixel_row23_1);
// c4 0 c5 0 c6 0 c7 0 || d4 0 d5 0 d6 0 d7 0
ru14 = _mm256_unpackhi_epi16(src_cd, zero);
ru14 = _mm256_sub_epi16(ru14, center_pixel_row01_0);
// e4 0 e5 0 e6 0 e7 0 || f4 0 f5 0 f6 0 f7 0
ru15 = _mm256_unpackhi_epi16(src_ef, zero);
ru15 = _mm256_sub_epi16(ru15, center_pixel_row23_0);
// f5 f1 f5 f1 - - - -
const __m128i filt51 =
_mm_blend_epi16(filt_coeff, _mm_bsrli_si128(filt_coeff, 4), 0x08);
const __m256i fc51 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt51, 0x07));
// f6 f1 f6 f1 - - - -
const __m128i filt61 =
_mm_blend_epi16(filt_coeff, _mm_bsrli_si128(filt_coeff, 6), 0x08);
const __m256i fc61 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt61, 0x07));
// f5 0 f5 0 f5 0 f5 0 - -
const __m256i fc5z = _mm256_blend_epi16(fc51, zero, 0xAA);
// Multiply the formed registers by the filter coefficients and add the results
// to the accumulator registers.
MADD_AND_ACCUM_FOR_CHROMA_FILTERING(fc51, fc61, fc5z)
}
// AVX2 intrinsic for convolve wiener non-separable loop restoration with 6-tap
// filtering. The output for a particular pixel in a 4x4 block is calculated
// with a DIAMOND shaped filter considering a 5x5 grid centered on that pixel.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 x x
// Load Source Data:
// src_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// src_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// src_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// src_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// src_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// src_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// src_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// The output for a pixel located at c2 position is calculated as below.
// Filtered_c2 = ((a2-c2)+(e2-c2))*fc4 + ((b1-c2+d3-c2))*fc2 +
// (b2-c2+d2-c2)*fc0 + (b3-c2+d1-c2)*fc3 + (c0-c2+c4-c2)*fc5 +
// (c1-c2+c3-c2)*fc1 + c2*singleton_tap + dc_offset
// The source registers are unpacked such that the output corresponding to 2
// rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The output corresponding to fc4 of rows 0 and 1 is achieved like below.
// __m256i centerpixel_row01 = c2 c2 c3 c3 c4 c4 c5 c5 | d2 d2 d3 d3 d4 d4 d5 d5
// __m256i src_reg3 = a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
// __m256i filter_4 = fc4 fc4 fc4 fc4 fc4 fc4 fc4 fc4 | fc4 fc4 fc4 fc4 fc4 fc4
// fc4 fc4
// src_reg3 = _mm256_sub_epi16(src_reg3, centerpixel_row01);
// __m256i out_f4_01 = _mm256_madd_epi16(src_reg3, filter_4);
// = ((a2-c2)*fc4+(e2-c2)*fc4) ((a3-c3)*fc4+(e3-c3)*fc4) .. |
// ((b2-d2)*fc4+(f2-d2)*fc4) . .
// Here, out_f4_01 contains partial output of rows 0 and 1 corresponding to fc4.
void av1_convolve_symmetric_subtract_center_highbd_6tap_avx2(
const uint16_t *dgd, int stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_col_begin) {
// Derive singleton_tap and dc_offset.
const int32_t singleton_tap = 1 << filter_config->prec_bits;
int32_t dc_offset = 0;
if (filter_config->num_pixels % 2) {
const int dc_offset_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
dc_offset = filter[dc_offset_tap_index];
}
// Load filter tap values.
// fc0 fc1 fc2 fc3 fc4 fc5 center_tap x
const __m128i filt_coeff_0 = _mm_loadu_si128((__m128i const *)(filter));
// Replace the center_tap with derived singleton_tap.
const __m128i center_tap = _mm_set1_epi16(singleton_tap);
const __m128i filt_coeff = _mm_blend_epi16(filt_coeff_0, center_tap, 0x40);
// Initializing the output registers with zero
__m256i accum_out_r0r1 = _mm256_setzero_si256();
__m256i accum_out_r2r3 = _mm256_setzero_si256();
// Perform 6-tap filtering on source buffer
apply_6tap_filtering(dgd, stride, filt_coeff, &accum_out_r0r1,
&accum_out_r2r3, block_row_begin, block_col_begin);
// Offset addition
const __m128i offset_reg = _mm_set1_epi32(dc_offset);
const __m256i ofs = _mm256_inserti128_si256(
_mm256_castsi128_si256(offset_reg), offset_reg, 1);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, ofs);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, ofs);
// Store the output after rounding and clipping
round_and_store_avx2(dst, dst_stride, filter_config, bit_depth,
accum_out_r0r1, accum_out_r2r3, block_row_begin,
block_col_begin);
}
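// Illustrative scalar sketch (not used by the SIMD path above): computes the
// 6-tap DIAMOND subtract-center output for a single pixel, following the
// per-pixel formula in the comment above
// av1_convolve_symmetric_subtract_center_highbd_6tap_avx2(). The fc0..fc5 tap
// order and the symmetric pixel pairs are assumptions taken from that comment;
// in the library the actual tap positions come from filter_config->config. The
// caller would still round by prec_bits and clamp to the bit depth, as done by
// round_and_store_avx2().
static AOM_INLINE int32_t scalar_6tap_subtract_center_sketch(
    const uint16_t *dgd, int stride, const int16_t *fc, int32_t singleton_tap,
    int32_t dc_offset, int row, int col) {
  const int32_t center = dgd[row * stride + col];
  // Symmetric pixel pairs as (row_offset, col_offset) for taps fc0..fc5.
  static const int offsets[6][2][2] = {
    { { -1, 0 }, { 1, 0 } },   // fc0: b2/d2
    { { 0, -1 }, { 0, 1 } },   // fc1: c1/c3
    { { -1, -1 }, { 1, 1 } },  // fc2: b1/d3
    { { -1, 1 }, { 1, -1 } },  // fc3: b3/d1
    { { -2, 0 }, { 2, 0 } },   // fc4: a2/e2
    { { 0, -2 }, { 0, 2 } },   // fc5: c0/c4
  };
  int32_t acc = center * singleton_tap + dc_offset;
  for (int k = 0; k < 6; ++k) {
    const int32_t p0 =
        dgd[(row + offsets[k][0][0]) * stride + (col + offsets[k][0][1])];
    const int32_t p1 =
        dgd[(row + offsets[k][1][0]) * stride + (col + offsets[k][1][1])];
    acc += ((p0 - center) + (p1 - center)) * fc[k];
  }
  return acc;
}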
// TODO(Arun Negi): Difference of source and center pixel needs to go through
// clip_base(). Implement clip_base() in intrinsic once the support is added.
//
// AVX2 intrinsic for convolve wiener non-separable loop restoration with
// 12/13-tap filtering. The output for a particular pixel in a 4x4 block is
// calculated with a DIAMOND shaped filter considering the 7x7 grid surrounding
// that pixel.
// The following describes the design considered for intrinsic
// implementation.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 fc9 fc10 fc11 fc12
// Load Source Data:
// src_ra0 = a0 a1 a2 a3 a4 a5 a6 a7
// src_rb0 = b0 b1 b2 b3 b4 b5 b6 b7
// src_rc0 = c0 c1 c2 c3 c4 c5 c6 c7
// src_rd0 = d0 d1 d2 d3 d4 d5 d6 d7
// src_re0 = e0 e1 e2 e3 e4 e5 e6 e7
// src_rf0 = f0 f1 f2 f3 f4 f5 f6 f7
// src_rg0 = g0 g1 g2 g3 g4 g5 g6 g7
// The output for a pixel located at d3 position is calculated as below.
// Filtered_d3 = ((a3-d3)+(g3-d3))*fc10 +
// ((b2-d3+f4-d3))*fc6 + (b3-d3+f3-d3)*fc2 + (b4-d3+f5-d3)*fc7 +
// (c1-d3+e5-d3)*fc8 + (c2-d3+e4-d3)*fc4 + (c3-d3+e3-d3)*fc0 +
// (c4-d3+e2-d3)*fc5 + (c5-d3+e1-d3)*fc9 + (d0-d3+d6-d3)*fc11 +
// (d1-d3+d5-d3)*fc3 + (d2-d3+d4-d3)*fc1 + d3*singleton_tap + dc_offset
// The source registers are unpacked such that the output corresponding to 2
// rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The output corresponding to fc10 of rows 0 and 1 is achieved like below.
// __m256i centerpixel_row01 = d3 d3 d4 d4 d5 d5 d6 d6 | e3 e3 e4 e4 e5 e5 e6 e6
// __m256i src_reg3 = a3 g3 a4 g4 a5 g5 a6 g6 | b3 h3 b4 h4 b5 h5 b6 h6
// __m256i filter_10 = f10 f10 f10 f10 f10 f10 f10 f10 | f10 f10 f10 f10 f10 f10
// f10 f10
// __m256i src_reg3 = _mm256_sub_epi16(src_reg3, centerpixel_row01);
// __m256i out_f10_01 = _mm256_madd_epi16(src_reg3, filter_10);
// = ((a3-d3)*f10+(g3-d3)*f10) ((a4-d4)*f10+(g4-d4)*f10) .. |
// ((b3-e3)*f10+(h3-e3)*f10) . .
// Here, out_f10_01 contains partial output of rows 0 and 1 corresponding to
// fc10.
void av1_convolve_symmetric_subtract_center_highbd_12tap_avx2(
const uint16_t *dgd, int stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_col_begin) {
// Derive singleton_tap and dc_offset.
const int32_t singleton_tap = 1 << filter_config->prec_bits;
int32_t dc_offset = 0;
if (filter_config->num_pixels % 2) {
const int dc_offset_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
dc_offset = filter[dc_offset_tap_index];
}
// Load source data.
int src_index_start = block_row_begin * stride + block_col_begin;
const uint16_t *src_ptr = dgd + src_index_start - 3 * stride - 3;
const __m128i src_a0 = _mm_loadu_si128((__m128i const *)src_ptr);
const __m128i src_b0 = _mm_loadu_si128((__m128i const *)(src_ptr + stride));
const __m128i src_c0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 2 * stride));
const __m128i src_d0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 3 * stride));
const __m128i src_e0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 4 * stride));
const __m128i src_f0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 5 * stride));
const __m128i src_g0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 6 * stride));
const __m128i src_h0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 7 * stride));
const __m128i src_i0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 8 * stride));
const __m128i src_j0 =
_mm_loadu_si128((__m128i const *)(src_ptr + 9 * stride));
// Load filter tap values.
// fc0 fc1 fc2 fc3 fc4 fc5 fc6 fc7
const __m128i filt_coeff_0 = _mm_loadu_si128((__m128i const *)(filter));
// fc5 fc6 fc7 fc8 fc9 fc10 fc11 fc12
const __m128i filta = _mm_loadu_si128((__m128i const *)(filter + 5));
// Replace the fc12 with derived singleton_tap.
const __m128i center_tap = _mm_set1_epi16(singleton_tap);
const __m128i filt_coeff_1 = _mm_blend_epi16(filta, center_tap, 0x80);
// Form 256bit source registers.
// a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
const __m256i src_ab =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_a0), src_b0, 1);
// c0 c1 c2 c3 c4 c5 c6 c7 | d0 d1 d2 d3 d4 d5 d6 d7
const __m256i src_cd =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_c0), src_d0, 1);
// e0 e1 e2 e3 e4 e5 e6 e7 | f0 f1 f2 f3 f4 f5 f6 f7
const __m256i src_ef =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_e0), src_f0, 1);
// g0 g1 g2 g3 g4 g5 g6 g7 | h0 h1 h2 h3 h4 h5 h6 h7
const __m256i src_gh =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_g0), src_h0, 1);
// i0 i1 i2 i3 i4 i5 i6 i7 | j0 j1 j2 j3 j4 j5 j6 j7
const __m256i src_ij =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_i0), src_j0, 1);
// Derive registers to hold center pixel
const __m128i center_pixel_d = _mm_bslli_si128(src_d0, 2);
const __m128i cp_d_high = _mm_unpackhi_epi16(center_pixel_d, center_pixel_d);
const __m128i center_pixel_e = _mm_bslli_si128(src_e0, 2);
const __m128i cp_e_high = _mm_unpackhi_epi16(center_pixel_e, center_pixel_e);
// d3 d3 d4 d4 d5 d5 d6 d6 | e3 e3 e4 e4 e5 e5 e6 e6
const __m256i center_pixel_row01_0 =
_mm256_inserti128_si256(_mm256_castsi128_si256(cp_d_high), cp_e_high, 1);
const __m128i center_pixel_f = _mm_bslli_si128(src_f0, 2);
const __m128i cp_f_high = _mm_unpackhi_epi16(center_pixel_f, center_pixel_f);
const __m128i center_pixel_g = _mm_bslli_si128(src_g0, 2);
const __m128i cp_g_high = _mm_unpackhi_epi16(center_pixel_g, center_pixel_g);
// f3 f3 f4 f4 f5 f5 f6 f6 | g3 g3 g4 g4 g5 g5 g6 g6
const __m256i center_pixel_row23_0 =
_mm256_inserti128_si256(_mm256_castsi128_si256(cp_f_high), cp_g_high, 1);
const __m128i zero = _mm_set1_epi16(0x0);
const __m128i cp_d0_high = _mm_unpackhi_epi16(center_pixel_d, zero);
const __m128i cp_e0_high = _mm_unpackhi_epi16(center_pixel_e, zero);
// d3 0 d4 0 d5 0 d6 0 | e3 0 e4 0 e5 0 e6 0
const __m256i center_pixel_row01_1 = _mm256_inserti128_si256(
_mm256_castsi128_si256(cp_d0_high), cp_e0_high, 1);
const __m128i cp_f0_high = _mm_unpackhi_epi16(center_pixel_f, zero);
const __m128i cp_g0_high = _mm_unpackhi_epi16(center_pixel_g, zero);
// f3 0 f4 0 f5 0 f6 0 | g3 0 g4 0 g5 0 g6 0
const __m256i center_pixel_row23_1 = _mm256_inserti128_si256(
_mm256_castsi128_si256(cp_f0_high), cp_g0_high, 1);
// Packing the source rows.
// a0 g0 a1 g1 a2 g2 a3 g3 | b0 h0 b1 h1 b2 h2 b3 h3
const __m256i ru0 = _mm256_unpacklo_epi16(src_ab, src_gh);
// a4 g4 a5 g5 a6 g6 a7 g7 | b4 h4 b5 h5 b6 h6 b7 h7
const __m256i ru1 = _mm256_unpackhi_epi16(src_ab, src_gh);
// c0 e0 c1 e1 c2 e2 c3 e3 | d0 f0 d1 f1 d2 f2 d3 f3
const __m256i ru2 = _mm256_unpacklo_epi16(src_cd, src_ef);
// c4 e4 c5 e5 c6 e6 c7 e7 | d4 f4 d5 f5 d6 f6 d7 f7
__m256i ru3 = _mm256_unpackhi_epi16(src_cd, src_ef);
// c0 i0 c1 i1 c2 i2 c3 i3 | d0 j0 d1 j1 d2 j2 d3 j3
const __m256i ru4 = _mm256_unpacklo_epi16(src_cd, src_ij);
// c4 i4 c5 i5 c6 i6 c7 i7 | d4 j4 d5 j5 d6 j6 d7 j7
const __m256i ru5 = _mm256_unpackhi_epi16(src_cd, src_ij);
// e0 g0 e1 g1 e2 g2 e3 g3 | f0 h0 f1 h1 f2 h2 f3 h3
const __m256i ru6 = _mm256_unpacklo_epi16(src_ef, src_gh);
// e4 g4 e5 g5 e6 g6 e7 g7 | f4 h4 f5 h5 f6 h6 f7 h7
__m256i ru7 = _mm256_unpackhi_epi16(src_ef, src_gh);
// Output corresponding to filter coefficient 10.
// a3 g3 a4 g4 a5 g5 a6 g6 | b3 h3 b4 h4 b5 h5 b6 h6
const __m256i ru8_0 = _mm256_alignr_epi8(ru1, ru0, 12);
const __m256i ru8 = _mm256_sub_epi16(ru8_0, center_pixel_row01_0);
// c3 i3 c4 i4 c5 i5 c6 i6 | d3 j3 d4 j4 d5 j5 d6 j6
const __m256i ru9_0 = _mm256_alignr_epi8(ru5, ru4, 12);
const __m256i ru9 = _mm256_sub_epi16(ru9_0, center_pixel_row23_0);
// f10 f10 f10 f10 f10 f10 f10 f10 f10 f10 f10 f10 f10 f10 f10 f10
const __m128i fil10 = _mm_bsrli_si128(filt_coeff_1, 4);
const __m256i fc10 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(fil10, 0x0F));
// r00 r01 r02 r03 | r10 r11 r12 r13
__m256i accum_out_r0r1 = _mm256_madd_epi16(ru8, fc10);
// r20 r21 r22 r23 | r30 r31 r32 r33
__m256i accum_out_r2r3 = _mm256_madd_epi16(ru9, fc10);
// Output corresponding to filter coefficients 8,4,0,5.
// c1 e1 c2 e2 c3 e3 c4 e4 | d1 f1 d2 f2 d3 f3 d4 f4
const __m256i ru10_0 = _mm256_alignr_epi8(ru3, ru2, 4);
const __m256i ru10 = _mm256_sub_epi16(ru10_0, center_pixel_row01_0);
// e1 g1 e2 g2 e3 g3 e4 g4 | f1 h1 f2 h2 f3 h3 f4 h4
const __m256i ru11_0 = _mm256_alignr_epi8(ru7, ru6, 4);
const __m256i ru11 = _mm256_sub_epi16(ru11_0, center_pixel_row23_0);
// c2 e2 c3 e3 c4 e4 c5 e5 | d2 f2 d3 f3 d4 f4 d5 f5
const __m256i ru12_0 = _mm256_alignr_epi8(ru3, ru2, 8);
const __m256i ru12 = _mm256_sub_epi16(ru12_0, center_pixel_row01_0);
// e2 g2 e3 g3 e4 g4 e5 g5 | f2 h2 f3 h3 f4 h4 f5 h5
const __m256i ru13_0 = _mm256_alignr_epi8(ru7, ru6, 8);
const __m256i ru13 = _mm256_sub_epi16(ru13_0, center_pixel_row23_0);
// c3 e3 c4 e4 c5 e5 c6 e6 | d3 f3 d4 f4 d5 f5 d6 f6
const __m256i ru14_0 = _mm256_alignr_epi8(ru3, ru2, 12);
const __m256i ru14 = _mm256_sub_epi16(ru14_0, center_pixel_row01_0);
// e3 g3 e4 g4 e5 g5 e6 g6 | f3 h3 f4 h4 f5 h5 f6 h6
const __m256i ru15_0 = _mm256_alignr_epi8(ru7, ru6, 12);
const __m256i ru15 = _mm256_sub_epi16(ru15_0, center_pixel_row23_0);
// fc8 fc9 fc10 fc11 singleton_tap 0 0 0
const __m128i filt89 = _mm_bsrli_si128(filt_coeff_1, 6);
const __m256i fc89 = _mm256_broadcastd_epi32(filt89);
const __m128i filt45 = _mm_bsrli_si128(filt_coeff_0, 8);
const __m256i fc45 = _mm256_broadcastd_epi32(filt45);
const __m256i fc00 = _mm256_broadcastw_epi16(filt_coeff_0);
const __m128i filt54 = _mm_bsrli_si128(filt_coeff_0, 4);
const __m256i fc54 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt54, 0x0B));
ru3 = _mm256_sub_epi16(ru3, center_pixel_row01_0);
ru7 = _mm256_sub_epi16(ru7, center_pixel_row23_0);
const __m256i res_0 = _mm256_madd_epi16(ru10, fc89);
const __m256i res_1 = _mm256_madd_epi16(ru11, fc89);
const __m256i res_2 = _mm256_madd_epi16(ru12, fc45);
const __m256i res_3 = _mm256_madd_epi16(ru13, fc45);
const __m256i res_4 = _mm256_madd_epi16(ru14, fc00);
const __m256i res_5 = _mm256_madd_epi16(ru15, fc00);
const __m256i res_6 = _mm256_madd_epi16(ru3, fc54);
const __m256i res_7 = _mm256_madd_epi16(ru7, fc54);
// r00 r01 r02 r03 | r10 r11 r12 r13
const __m256i out_0 = _mm256_add_epi32(res_0, res_2);
const __m256i out_1 = _mm256_add_epi32(res_4, res_6);
const __m256i out_2 = _mm256_add_epi32(out_0, out_1);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, out_2);
// r20 r21 r22 r23 | r30 r31 r32 r33
const __m256i out_3 = _mm256_add_epi32(res_1, res_3);
const __m256i out_4 = _mm256_add_epi32(res_5, res_7);
const __m256i out_5 = _mm256_add_epi32(out_3, out_4);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, out_5);
// Output corresponding to filter coefficients 11,3,1,12.
// d2 d3 d4 d5 d6 d7 d8 d9
const __m128i src_d2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 3 * stride + 2));
// e2 e3 e4 e5 e6 e7 e8 e9
const __m128i src_e2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 4 * stride + 2));
// f2 f3 f4 f5 f6 f7 f8 f9
const __m128i src_f2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 5 * stride + 2));
// g2 g3 g4 g5 g6 g7 g8 g9
const __m128i src_g2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 6 * stride + 2));
// d0 d1 d2 d3 d4 d5 d6 d7 | e0 e1 e2 e3 e4 e5 e6 e7
const __m256i rm0 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_d0), src_e0, 1);
// d2 d3 d4 d5 d6 d7 d8 d9 | e2 e3 e4 e5 e6 e7 e8 e9
const __m256i rm2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_d2), src_e2, 1);
// f0 f1 f2 f3 f4 f5 f6 f7 | g0 g1 g2 g3 g4 g5 g6 g7
const __m256i rm00 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_f0), src_g0, 1);
// f2 f3 f4 f5 f6 f7 f8 f9 | g2 g3 g4 g5 g6 g7 g8 g9
const __m256i rm22 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_f2), src_g2, 1);
// d1 d2 d3 d4 d5 d6 d7 d8 | e1 e2 e3 e4 e5 e6 e7 e8
const __m256i rm1 = _mm256_alignr_epi8(_mm256_bsrli_epi128(rm2, 12), rm0, 2);
const __m256i rm11 =
_mm256_alignr_epi8(_mm256_bsrli_epi128(rm22, 12), rm00, 2);
// d3 d4 d5 d6 d7 d8 d9 0 | e3 e4 e5 e6 e7 e8 e9 0
const __m256i rm3 = _mm256_bsrli_epi128(rm2, 2);
const __m256i rm33 = _mm256_bsrli_epi128(rm22, 2);
// d0 d1 d1 d2 d2 d3 d3 d4 | e0 e1 e1 e2 e2 e3 e3 e4
__m256i rm4 = _mm256_unpacklo_epi16(rm0, rm1);
rm4 = _mm256_sub_epi16(rm4, center_pixel_row01_0);
// d2 d3 d3 d4 d4 d5 d5 d6 | e2 e3 e3 e4 e4 e5 e5 e6
__m256i rm5 = _mm256_unpacklo_epi16(rm2, rm3);
rm5 = _mm256_sub_epi16(rm5, center_pixel_row01_1);
// d4 d5 d5 d6 d6 d7 d7 d8 | e4 e5 e5 e6 e6 e7 e7 e8
__m256i rm6 = _mm256_unpackhi_epi16(rm0, rm1);
rm6 = _mm256_sub_epi16(rm6, center_pixel_row01_0);
// d6 0 d7 0 d8 0 d9 0 | e6 0 e7 0 e8 0 e9 0
__m256i rm7 = _mm256_unpackhi_epi16(rm2, _mm256_set1_epi16(0));
rm7 = _mm256_sub_epi16(rm7, center_pixel_row01_0);
__m256i rm44 = _mm256_unpacklo_epi16(rm00, rm11);
rm44 = _mm256_sub_epi16(rm44, center_pixel_row23_0);
__m256i rm55 = _mm256_unpacklo_epi16(rm22, rm33);
rm55 = _mm256_sub_epi16(rm55, center_pixel_row23_1);
__m256i rm66 = _mm256_unpackhi_epi16(rm00, rm11);
rm66 = _mm256_sub_epi16(rm66, center_pixel_row23_0);
__m256i rm77 = _mm256_unpackhi_epi16(rm22, _mm256_set1_epi16(0));
rm77 = _mm256_sub_epi16(rm77, center_pixel_row23_0);
// fc11 fc03 fc11 fc03 - -
const __m128i fc11_fc03 =
_mm_blend_epi16(_mm_bsrli_si128(filt_coeff_1, 8), filt_coeff_0, 0x08);
const __m256i filt1103 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(fc11_fc03, 0x0E));
// fc01 fc12 fc01 fc12 - -
const __m128i fc01_fc12 =
_mm_blend_epi16(_mm_bsrli_si128(filt_coeff_1, 8), filt_coeff_0, 0x02);
const __m256i filt0112 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(fc01_fc12, 0x0D));
// fc01 fc03 fc01 fc03 - -
const __m256i fc0103 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff_0, 0x0D));
// f11 0 f11 0 f11 0 f11 0 - -
const __m256i fc11z = _mm256_broadcastd_epi32(_mm_shufflelo_epi16(
_mm_bsrli_si128(_mm_unpackhi_epi16(filt_coeff_1, zero), 4), 0x0E));
rm4 = _mm256_madd_epi16(rm4, filt1103);
rm5 = _mm256_madd_epi16(rm5, filt0112);
rm6 = _mm256_madd_epi16(rm6, fc0103);
rm7 = _mm256_madd_epi16(rm7, fc11z);
rm44 = _mm256_madd_epi16(rm44, filt1103);
rm55 = _mm256_madd_epi16(rm55, filt0112);
rm66 = _mm256_madd_epi16(rm66, fc0103);
rm77 = _mm256_madd_epi16(rm77, fc11z);
// r00 r01 r02 r03 | r10 r11 r12 r13
rm4 = _mm256_add_epi32(rm4, rm5);
rm6 = _mm256_add_epi32(rm6, rm7);
rm4 = _mm256_add_epi32(rm4, rm6);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, rm4);
// r20 r21 r22 r23 | r30 r31 r32 r33
rm44 = _mm256_add_epi32(rm44, rm55);
rm66 = _mm256_add_epi32(rm66, rm77);
rm44 = _mm256_add_epi32(rm44, rm66);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, rm44);
// Output corresponding to filter coefficients 6,2,7,9.
const __m128i src_b2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 1 * stride + 2));
const __m128i src_c2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 2 * stride + 2));
const __m256i rn0 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_b2), src_c2, 1);
// b2 b3 b3 b4 b4 b5 b5 b6 | c2 c3 c3 c4 c4 c5 c5 c6
__m256i r0 = _mm256_unpacklo_epi16(rn0, _mm256_bsrli_epi128(rn0, 2));
r0 = _mm256_sub_epi16(r0, center_pixel_row01_0);
const __m256i rcd2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_c2), src_d2, 1);
// b4 c5 b5 c6 b6 c7 b7 c8 | c4 d5 c5 d6 c6 d7 c7 d8
__m256i r1 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rn0, 4),
_mm256_bsrli_epi128(rcd2, 6));
r1 = _mm256_sub_epi16(r1, center_pixel_row01_0);
const __m256i rfg2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_f2), src_g2, 1);
// f2 f3 f3 f4 f4 f5 f5 f6 | g2 g3 g3 g4 g4 g5 g5 g6
__m256i r2 = _mm256_unpacklo_epi16(rfg2, _mm256_bsrli_epi128(rfg2, 2));
r2 = _mm256_sub_epi16(r2, center_pixel_row01_0);
const __m256i ref2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_e2), src_f2, 1);
// f4 e5 f5 e6 f6 e7 f7 e8 | g4 f5 g5 f6 g6 f7 g7 f8
__m256i r3 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rfg2, 4),
_mm256_bsrli_epi128(ref2, 6));
r3 = _mm256_sub_epi16(r3, center_pixel_row01_0);
const __m128i filt62 = _mm_blend_epi16(filt_coeff_1, filt_coeff_0, 0x04);
// fc6 fc2 - - - - - - | fc6 fc2 - - - - - -
const __m256i fc62 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt62, 0x09));
// fc7 fc9 - - - - - - | fc7 fc9 - - - - - -
const __m256i fc79 = _mm256_broadcastd_epi32(
_mm_shufflelo_epi16(_mm_bsrli_si128(filt_coeff_1, 2), 0x0D));
// fc7 fc2 - - - - - - | fc7 fc2 - - - - - -
const __m128i filt72 =
_mm_blend_epi16(_mm_bsrli_si128(filt_coeff_1, 2), filt_coeff_0, 0x04);
const __m256i fc72 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt72, 0x09));
// fc6 fc8 - - - - - - | fc6 fc8 - - - - - -
const __m256i fc68 =
_mm256_broadcastd_epi32(_mm_shufflelo_epi16(filt_coeff_1, 0x0D));
r0 = _mm256_madd_epi16(r0, fc62);
r1 = _mm256_madd_epi16(r1, fc79);
r2 = _mm256_madd_epi16(r2, fc72);
r3 = _mm256_madd_epi16(r3, fc68);
// r00 r01 r02 r03 | r10 r11 r12 r13
r0 = _mm256_add_epi32(r0, r1);
r2 = _mm256_add_epi32(r2, r3);
r0 = _mm256_add_epi32(r0, r2);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, r0);
const __m256i rn1 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_d2), src_e2, 1);
// d2 d3 d3 d4 d4 d5 d5 d6 | e2 e3 e3 e4 e4 e5 e5 e6
__m256i r00 = _mm256_unpacklo_epi16(rn1, _mm256_bsrli_epi128(rn1, 2));
r00 = _mm256_sub_epi16(r00, center_pixel_row23_0);
// d4 e5 d5 e6 d6 e7 d7 e8 | e4 f5 e5 f6 e6 f7 e7 f8
__m256i r11 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rn1, 4),
_mm256_bsrli_epi128(ref2, 6));
r11 = _mm256_sub_epi16(r11, center_pixel_row23_0);
const __m128i src_h2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 7 * stride + 2));
const __m128i src_i2 =
_mm_loadu_si128((__m128i const *)(src_ptr + 8 * stride + 2));
// h2 h3 h4 h5 h6 h7 h8 h9 | i2 i3 i4 i5 i6 i7 i8 i9
__m256i rhi2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_h2), src_i2, 1);
// h2 h3 h3 h4 h4 h5 h5 h6 | i2 i3 i3 i4 i4 i5 i5 i6
__m256i r22 = _mm256_unpacklo_epi16(rhi2, _mm256_bsrli_epi128(rhi2, 2));
r22 = _mm256_sub_epi16(r22, center_pixel_row23_0);
// g2 g3 g4 g5 g6 g7 g8 g9 | h2 h3 h4 h5 h6 h7 h8 h9
__m256i rgh2 =
_mm256_inserti128_si256(_mm256_castsi128_si256(src_g2), src_h2, 1);
__m256i r33 = _mm256_unpacklo_epi16(_mm256_bsrli_epi128(rhi2, 4),
_mm256_bsrli_epi128(rgh2, 6));
r33 = _mm256_sub_epi16(r33, center_pixel_row23_0);
r00 = _mm256_madd_epi16(r00, fc62);
r11 = _mm256_madd_epi16(r11, fc79);
r22 = _mm256_madd_epi16(r22, fc72);
r33 = _mm256_madd_epi16(r33, fc68);
// r20 r21 r22 r23 | r30 r31 r32 r33
r00 = _mm256_add_epi32(r00, r11);
r22 = _mm256_add_epi32(r22, r33);
r00 = _mm256_add_epi32(r00, r22);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, r00);
// Offset addition
const __m128i offset_reg = _mm_set1_epi32(dc_offset);
const __m256i ofs = _mm256_inserti128_si256(
_mm256_castsi128_si256(offset_reg), offset_reg, 1);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, ofs);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, ofs);
// Rounding and clipping.
accum_out_r0r1 =
round_power_of_two_signed_avx2(accum_out_r0r1, filter_config->prec_bits);
accum_out_r2r3 =
round_power_of_two_signed_avx2(accum_out_r2r3, filter_config->prec_bits);
// r00 r01 r02 r03 | r20 r21 r22 r23 | r10 r11 r12 r13 | r30 r31 r32 r33
__m256i out_r0r2r1r3 = _mm256_packs_epi32(accum_out_r0r1, accum_out_r2r3);
const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
out_r0r2r1r3 = highbd_clamp_epi16(out_r0r2r1r3, _mm256_setzero_si256(), max);
// Store the output.
const int dst_id = block_row_begin * dst_stride + block_col_begin;
const __m128i out_r1r3 = _mm256_extractf128_si256(out_r0r2r1r3, 1);
_mm_storel_epi64((__m128i *)(dst + dst_id),
_mm256_castsi256_si128(out_r0r2r1r3));
_mm_storel_epi64((__m128i *)(dst + dst_id + (1 * dst_stride)), out_r1r3);
_mm_storel_epi64((__m128i *)(dst + dst_id + (2 * dst_stride)),
_mm_bsrli_si128(_mm256_castsi256_si128(out_r0r2r1r3), 8));
_mm_storel_epi64((__m128i *)(dst + dst_id + (3 * dst_stride)),
_mm_bsrli_si128(out_r1r3, 8));
}
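// Illustrative scalar sketch of the final rounding and clipping applied by the
// 12/13-tap path above. It assumes round_power_of_two_signed_avx2() performs
// sign-aware rounding by filter_config->prec_bits (with prec_bits > 0) and
// that highbd_clamp_epi16() clamps to [0, (1 << bit_depth) - 1]; the helper
// name below is local to this sketch and not part of the library API.
static AOM_INLINE uint16_t round_and_clip_pixel_sketch(int32_t accum,
                                                       int prec_bits,
                                                       int bit_depth) {
  const int32_t half = 1 << (prec_bits - 1);
  // Round away from zero by prec_bits, matching signed rounding.
  const int32_t rounded = (accum < 0) ? -((-accum + half) >> prec_bits)
                                      : ((accum + half) >> prec_bits);
  const int32_t max_pixel = (1 << bit_depth) - 1;
  if (rounded < 0) return 0;
  if (rounded > max_pixel) return (uint16_t)max_pixel;
  return (uint16_t)rounded;
}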
// SIMD implementation to convolve a block of pixels with an origin-symmetric,
// wiener non-separable filter corresponding to CONFIG_WIENER_NONSEP loop
// restoration. A DIAMOND shape with a 13/12-tap or 6-tap filter is used for
// the convolution.
void av1_convolve_symmetric_subtract_center_highbd_avx2(
const uint16_t *dgd, int stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_row_end, int block_col_begin,
int block_col_end) {
assert(filter_config->subtract_center);
const int num_rows = block_row_end - block_row_begin;
const int num_cols = block_col_end - block_col_begin;
const int num_sym_taps = filter_config->num_pixels / 2;
// SIMD is implemented only for a diamond-shaped filter with 13 taps (12
// symmetric + 1) or 6 taps and a block size of 4x4. For any other case,
// invoke the C function.
if (num_rows != 4 || num_cols != 4 ||
(num_sym_taps != 12 && num_sym_taps != 6)) {
av1_convolve_symmetric_subtract_center_highbd_c(
dgd, stride, filter_config, filter, dst, dst_stride, bit_depth,
block_row_begin, block_row_end, block_col_begin, block_col_end);
return;
}
// 6-tap implementation for convolve symmetric subtract center
if (num_sym_taps == 6) {
av1_convolve_symmetric_subtract_center_highbd_6tap_avx2(
dgd, stride, filter_config, filter, dst, dst_stride, bit_depth,
block_row_begin, block_col_begin);
return;
}
// 12-tap implementation for convolve symmetric subtract center
if (num_sym_taps == 12) {
av1_convolve_symmetric_subtract_center_highbd_12tap_avx2(
dgd, stride, filter_config, filter, dst, dst_stride, bit_depth,
block_row_begin, block_col_begin);
return;
}
}
// AVX2 intrinsic for convolve wiener non-separable dual loop restoration
// filtering. The output for a particular pixel in a 4x4 block is calculated
// with a DIAMOND shaped filter considering the 5x5 grid surrounding that pixel.
// Filter Coefficients: fc0 fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 fc9 fc10 fc11
// 6-tap filtering for dgd (first) buffer:
// dgd_reg_a = a0 a1 a2 a3 a4 a5 a6 a7
// dgd_reg_b = b0 b1 b2 b3 b4 b5 b6 b7
// dgd_reg_c = c0 c1 c2 c3 c4 c5 c6 c7
// dgd_reg_d = d0 d1 d2 d3 d4 d5 d6 d7
// dgd_reg_e = e0 e1 e2 e3 e4 e5 e6 e7
// The output for a pixel located at c2 position is calculated as below.
// dgd_output_c2 = ((a2-c2)+(e2-c2))*fc4 + ((b1-c2+d3-c2))*fc2 +
// (b2-c2+d2-c2)*fc0 + (b3-c2+d1-c2)*fc3 + (c0-c2+c4-c2)*fc5 +
// (c1-c2+c3-c2)*fc1 + c2*singleton_tap + dc_offset
//
// 6-tap filtering for dgd_dual (second) buffer:
// dgd_dual_reg_a = a0 a1 a2 a3 a4 a5 a6 a7
// dgd_dual_reg_b = b0 b1 b2 b3 b4 b5 b6 b7
// dgd_dual_reg_c = c0 c1 c2 c3 c4 c5 c6 c7
// dgd_dual_reg_d = d0 d1 d2 d3 d4 d5 d6 d7
// dgd_dual_reg_e = e0 e1 e2 e3 e4 e5 e6 e7
// dgd_dual_output_c2 = ((a2-c2)+(e2-c2))*fc10 + ((b1-c2+d3-c2))*fc8 +
// (b2-c2+d2-c2)*fc6 + (b3-c2+d1-c2)*fc9 + (c0-c2+c4-c2)*fc11 +
// (c1-c2+c3-c2)*fc7
// output_c2 = dgd_output_c2 + dgd_dual_output_c2
// The source registers are unpacked such that the output corresponding to 2
// rows will be produced in a single register (i.e., processing 2 rows
// simultaneously).
//
// Example:
// The output corresponding to fc4 of rows 0 and 1 is achieved like below.
// __m256i centerpixel_row01 = c2 c2 c3 c3 c4 c4 c5 c5 | d2 d2 d3 d3 d4 d4 d5 d5
// __m256i src_reg3 = a2 e2 a3 e3 a4 e4 a5 e5 | b2 f2 b3 f3 b4 f4 b5 f5
// __m256i filter_4 = fc4 fc4 fc4 fc4 fc4 fc4 fc4 fc4 | fc4 fc4 fc4 fc4 fc4 fc4
// fc4 fc4
// __m256i src_reg3 = _mm256_sub_epi16(src_reg3, centerpixel_row01);
// __m256i out_f4_01 = _mm256_madd_epi16(src_reg3, filter_4);
// = ((a2-c2)*fc4+(e2-c2)*fc4) ((a3-c3)*fc4+(e3-c3)*fc4) .. |
// ((b2-d2)*fc4+(f2-d2)*fc4) . .
// Here, out_f4_01 contains partial output of rows 0 and 1 corresponding to fc4.
void av1_convolve_symmetric_dual_subtract_center_highbd_avx2(
const uint16_t *dgd, int dgd_stride, const uint16_t *dgd_dual,
int dgd_dual_stride, const NonsepFilterConfig *filter_config,
const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
int block_row_begin, int block_row_end, int block_col_begin,
int block_col_end) {
assert(filter_config->subtract_center);
const int num_rows = block_row_end - block_row_begin;
const int num_cols = block_col_end - block_col_begin;
const int num_sym_taps = filter_config->num_pixels / 2;
const int num_sym_taps_dual = filter_config->num_pixels2 / 2;
// SIMD is implemented only for a diamond-shaped filter with 6 taps and a block
// size of 4x4. For any other case, invoke the C function.
if (num_rows != 4 || num_cols != 4 || num_sym_taps != 6 ||
num_sym_taps_dual != 6) {
av1_convolve_symmetric_dual_subtract_center_highbd_c(
dgd, dgd_stride, dgd_dual, dgd_dual_stride, filter_config, filter, dst,
dst_stride, bit_depth, block_row_begin, block_row_end, block_col_begin,
block_col_end);
return;
}
const int32_t singleton_tap = 1 << filter_config->prec_bits;
int32_t dc_offset = 0;
if (filter_config->num_pixels % 2) {
const int dc_offset_tap_index =
filter_config->config[filter_config->num_pixels - 1][NONSEP_BUF_POS];
dc_offset = filter[dc_offset_tap_index];
}
// Prepare filter coefficients for dgd buffer 6-tap filtering
// fc0 fc1 fc2 fc3 fc4 fc5 center_tap x
__m128i filter_coeff = _mm_loadu_si128((__m128i const *)(filter));
// Replace the center_tap with derived singleton_tap.
const __m128i center_tap = _mm_set1_epi16(singleton_tap);
const __m128i filter_coeff_dgd =
_mm_blend_epi16(filter_coeff, center_tap, 0x40);
// Prepare filter coefficients for dgd_dual buffer 6-tap filtering
// fc6 fc7 fc8 fc9 fc10 fc11 0 0
filter_coeff = _mm_loadu_si128((__m128i const *)(filter + 4));
const __m128i filter_coeff_dgd_dual = _mm_bsrli_si128(filter_coeff, 4);
// Initialize the output registers with zero
__m256i accum_out_r0r1 = _mm256_setzero_si256();
__m256i accum_out_r2r3 = _mm256_setzero_si256();
// 6-tap filtering for dgd (first) buffer
apply_6tap_filtering(dgd, dgd_stride, filter_coeff_dgd, &accum_out_r0r1,
&accum_out_r2r3, block_row_begin, block_col_begin);
// 6-tap filtering for dgd_dual (second) buffer
apply_6tap_filtering(dgd_dual, dgd_dual_stride, filter_coeff_dgd_dual,
&accum_out_r0r1, &accum_out_r2r3, block_row_begin,
block_col_begin);
// Offset addition
const __m128i offset_reg = _mm_set1_epi32(dc_offset);
const __m256i ofs = _mm256_inserti128_si256(
_mm256_castsi128_si256(offset_reg), offset_reg, 1);
accum_out_r0r1 = _mm256_add_epi32(accum_out_r0r1, ofs);
accum_out_r2r3 = _mm256_add_epi32(accum_out_r2r3, ofs);
// Store the output after rounding and clipping
round_and_store_avx2(dst, dst_stride, filter_config, bit_depth,
accum_out_r0r1, accum_out_r2r3, block_row_begin,
block_col_begin);
}
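// Illustrative scalar sketch of how the dual path above combines the two
// buffers for one output pixel: the same 6-tap diamond sum is evaluated on dgd
// with taps fc0..fc5 (plus the singleton tap and dc offset) and on dgd_dual
// with taps fc6..fc11 (no extra singleton tap or offset), and the two 32-bit
// accumulations are added before rounding. scalar_6tap_subtract_center_sketch()
// is the illustrative helper sketched earlier in this file; the fc6..fc11
// ordering for dgd_dual is an assumption taken from the comment above.
static AOM_INLINE int32_t scalar_dual_6tap_sketch(
    const uint16_t *dgd, int dgd_stride, const uint16_t *dgd_dual,
    int dgd_dual_stride, const int16_t *filter, int32_t singleton_tap,
    int32_t dc_offset, int row, int col) {
  // First buffer uses fc0..fc5 plus the singleton tap and dc offset.
  const int32_t first = scalar_6tap_subtract_center_sketch(
      dgd, dgd_stride, filter, singleton_tap, dc_offset, row, col);
  // Second buffer uses fc6..fc11 and contributes no extra singleton/offset.
  const int32_t second = scalar_6tap_subtract_center_sketch(
      dgd_dual, dgd_dual_stride, filter + 6, /*singleton_tap=*/0,
      /*dc_offset=*/0, row, col);
  return first + second;
}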
/* Computes the gradient across different directions for a given pixel using a
 * 3-tap filter. The following shows the gradient calculation
 * A0 V0 D0
 * H0 C H1
 * D1 V1 A1
 * At a given pixel position C,
 * Horizontal gradient = H0 - 2*C + H1
 * Vertical gradient = V0 - 2*C + V1
 * Diagonal gradient = D0 - 2*C + D1
 * Anti-diagonal gradient = A0 - 2*C + A1
 * The feature line buffers are updated with the absolute values of these
 * gradients. A scalar sketch of this computation follows the function below.
 */
void calc_gradient_in_various_directions_avx2(
int16_t *feature_line_buffers[], int row, int buffer_row,
const uint16_t *dgd, int dgd_stride, int width, int col_begin, int col_end,
int feature_length, int buffer_col) {
const int buffer_row_0 = buffer_row;
const int buffer_row_1 = buffer_row_0 + feature_length;
const int buffer_row_2 = buffer_row_1 + feature_length;
const int buffer_row_3 = buffer_row_2 + feature_length;
const int16_t *line_buf0 = feature_line_buffers[buffer_row_0];
const int16_t *line_buf1 = feature_line_buffers[buffer_row_1];
const int16_t *line_buf2 = feature_line_buffers[buffer_row_2];
const int16_t *line_buf3 = feature_line_buffers[buffer_row_3];
const int tot_width = col_end - col_begin;
// Width for which SIMD is possible due to constraints of odd width.
const int pos_simd_width = AOMMIN(tot_width, width + 3 - 2);
const uint16_t *cur_row = dgd + (row * dgd_stride) + col_begin;
const uint16_t *prev_row = cur_row - dgd_stride;
const uint16_t *next_row = cur_row + dgd_stride;
// Process width multiples of 16.
for (int wd = 0; wd + 16 < pos_simd_width; wd += 16) {
// p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16
const __m256i src_p = _mm256_loadu_si256((__m256i const *)(prev_row));
// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
const __m256i src_c = _mm256_loadu_si256((__m256i const *)(cur_row));
// n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15 n16
const __m256i src_n = _mm256_loadu_si256((__m256i const *)(next_row));
// p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15
const __m256i src_pm1 = _mm256_loadu_si256((__m256i const *)(prev_row - 1));
// c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
const __m256i src_cm1 = _mm256_loadu_si256((__m256i const *)(cur_row - 1));
// n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15
const __m256i src_nm1 = _mm256_loadu_si256((__m256i const *)(next_row - 1));
// p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17
const __m256i src_pp1 = _mm256_loadu_si256((__m256i const *)(prev_row + 1));
// c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 c17
const __m256i src_cp1 = _mm256_loadu_si256((__m256i const *)(cur_row + 1));
// n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15 n16 n17
const __m256i src_np1 = _mm256_loadu_si256((__m256i const *)(next_row + 1));
// Horizontal Gradient calculation
const __m256i base_val = _mm256_add_epi16(src_c, src_c);
const __m256i partial_horz_diff = _mm256_add_epi16(src_cm1, src_cp1);
const __m256i horz_diff = _mm256_sub_epi16(partial_horz_diff, base_val);
const __m256i abs_horz_diff = _mm256_abs_epi16(horz_diff);
_mm256_storeu_si256((__m256i *)(line_buf0 + wd), abs_horz_diff);
// Vertical Gradient calculation
const __m256i partial_vert_diff = _mm256_add_epi16(src_p, src_n);
const __m256i vert_diff = _mm256_sub_epi16(partial_vert_diff, base_val);
const __m256i abs_vert_diff = _mm256_abs_epi16(vert_diff);
_mm256_storeu_si256((__m256i *)(line_buf1 + wd), abs_vert_diff);
// Diagonal Gradient calculation
const __m256i partial_diag_diff = _mm256_add_epi16(src_pm1, src_np1);
const __m256i diag_diff = _mm256_sub_epi16(partial_diag_diff, base_val);
const __m256i abs_diag_diff = _mm256_abs_epi16(diag_diff);
_mm256_storeu_si256((__m256i *)(line_buf3 + wd), abs_diag_diff);
// Anti-Diagonal Gradient calculation
const __m256i partial_anti_diag_diff = _mm256_add_epi16(src_pp1, src_nm1);
const __m256i anti_diag_diff =
_mm256_sub_epi16(partial_anti_diag_diff, base_val);
const __m256i abs_anti_diag_diff = _mm256_abs_epi16(anti_diag_diff);
_mm256_storeu_si256((__m256i *)(line_buf2 + wd), abs_anti_diag_diff);
cur_row += 16;
prev_row += 16;
next_row += 16;
buffer_col += 16;
}
// Invoke C function to process remaining width.
const int remaining_width = tot_width - buffer_col;
if (remaining_width)
calc_gradient_in_various_directions_c(
feature_line_buffers, row, buffer_row, dgd, dgd_stride, width,
buffer_col - 1, col_end, feature_length, buffer_col);
}
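// Illustrative scalar sketch of the four 3-tap gradient values computed per
// pixel by the AVX2 loop above, for C = dgd[row * dgd_stride + col]. The
// diagonal/anti-diagonal naming follows the figure in the comment above
// calc_gradient_in_various_directions_avx2(); the absolute values of these
// gradients are what the loop stores into the feature line buffers. The
// function and parameter names are local to this sketch.
static AOM_INLINE void scalar_gradients_sketch(const uint16_t *dgd,
                                               int dgd_stride, int row, int col,
                                               int *abs_horz, int *abs_vert,
                                               int *abs_diag,
                                               int *abs_anti_diag) {
  const uint16_t *c = dgd + row * dgd_stride + col;
  const int base = 2 * c[0];
  const int horz = c[-1] + c[1] - base;                    // H0 - 2*C + H1
  const int vert = c[-dgd_stride] + c[dgd_stride] - base;  // V0 - 2*C + V1
  const int diag = c[-dgd_stride + 1] + c[dgd_stride - 1] - base;       // D0/D1
  const int anti_diag = c[-dgd_stride - 1] + c[dgd_stride + 1] - base;  // A0/A1
  *abs_horz = horz < 0 ? -horz : horz;
  *abs_vert = vert < 0 ? -vert : vert;
  *abs_diag = diag < 0 ? -diag : diag;
  *abs_anti_diag = anti_diag < 0 ? -anti_diag : anti_diag;
}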
void prepare_feature_sum_bufs_avx2(int *feature_sum_buffers[],
int16_t *feature_line_buffers[],
int feature_length, int buffer_row,
int col_begin, int col_end, int buffer_col) {
const int buffer_row_0 = buffer_row;
const int buffer_row_1 = buffer_row_0 + feature_length;
const int buffer_row_2 = buffer_row_1 + feature_length;
const int buffer_row_3 = buffer_row_2 + feature_length;
const int16_t *line_buf0 = feature_line_buffers[buffer_row_0];
const int16_t *line_buf1 = feature_line_buffers[buffer_row_1];
const int16_t *line_buf2 = feature_line_buffers[buffer_row_2];
const int16_t *line_buf3 = feature_line_buffers[buffer_row_3];
int *sum_buf0 = feature_sum_buffers[0];
int *sum_buf1 = feature_sum_buffers[1];
int *sum_buf2 = feature_sum_buffers[2];
int *sum_buf3 = feature_sum_buffers[3];
const int tot_width = col_end - col_begin;
// Process width multiples of 8.
for (int wd = 0; wd + 8 < tot_width; wd += 8) {
const __m128i line0 = _mm_loadu_si128((__m128i const *)(line_buf0 + wd));
const __m128i line1 = _mm_loadu_si128((__m128i const *)(line_buf1 + wd));
const __m128i line2 = _mm_loadu_si128((__m128i const *)(line_buf2 + wd));
const __m128i line3 = _mm_loadu_si128((__m128i const *)(line_buf3 + wd));
const __m256i sum0 = _mm256_loadu_si256((__m256i const *)(sum_buf0 + wd));
const __m256i sum1 = _mm256_loadu_si256((__m256i const *)(sum_buf1 + wd));
const __m256i sum2 = _mm256_loadu_si256((__m256i const *)(sum_buf2 + wd));
const __m256i sum3 = _mm256_loadu_si256((__m256i const *)(sum_buf3 + wd));
const __m256i line00 = _mm256_cvtepi16_epi32(line0);
const __m256i line11 = _mm256_cvtepi16_epi32(line1);
const __m256i line22 = _mm256_cvtepi16_epi32(line2);
const __m256i line33 = _mm256_cvtepi16_epi32(line3);
const __m256i sub0 = _mm256_sub_epi32(sum0, line00);
const __m256i sub1 = _mm256_sub_epi32(sum1, line11);
const __m256i sub2 = _mm256_sub_epi32(sum2, line22);
const __m256i sub3 = _mm256_sub_epi32(sum3, line33);
_mm256_storeu_si256((__m256i *)(sum_buf0 + wd), sub0);
_mm256_storeu_si256((__m256i *)(sum_buf1 + wd), sub1);
_mm256_storeu_si256((__m256i *)(sum_buf2 + wd), sub2);
_mm256_storeu_si256((__m256i *)(sum_buf3 + wd), sub3);
buffer_col += 8;
}
// Invoke C function to process remaining width.
const int remaining_width = tot_width - buffer_col;
if (remaining_width)
prepare_feature_sum_bufs_c(feature_sum_buffers, feature_line_buffers,
feature_length, buffer_row, 0, remaining_width,
buffer_col);
}
static void update_feature_sum_bufs_avx2(int *feature_sum_buffers[],
int16_t *feature_line_buffers[],
int feature_length, int buffer_row,
int col_begin, int col_end,
int buffer_col) {
const int buffer_row_0 = buffer_row;
const int buffer_row_1 = buffer_row_0 + feature_length;
const int buffer_row_2 = buffer_row_1 + feature_length;
const int buffer_row_3 = buffer_row_2 + feature_length;
const int16_t *line_buf0 = feature_line_buffers[buffer_row_0];
const int16_t *line_buf1 = feature_line_buffers[buffer_row_1];
const int16_t *line_buf2 = feature_line_buffers[buffer_row_2];
const int16_t *line_buf3 = feature_line_buffers[buffer_row_3];
int *sum_buf0 = feature_sum_buffers[0];
int *sum_buf1 = feature_sum_buffers[1];
int *sum_buf2 = feature_sum_buffers[2];
int *sum_buf3 = feature_sum_buffers[3];
const int tot_width = col_end - col_begin;
for (int wd = 0; wd + 8 < tot_width; wd += 8) {
const __m128i line0 = _mm_loadu_si128((__m128i const *)(line_buf0 + wd));
const __m128i line1 = _mm_loadu_si128((__m128i const *)(line_buf1 + wd));
const __m128i line2 = _mm_loadu_si128((__m128i const *)(line_buf2 + wd));
const __m128i line3 = _mm_loadu_si128((__m128i const *)(line_buf3 + wd));
const __m256i fsum0 = _mm256_loadu_si256((__m256i const *)(sum_buf0 + wd));
const __m256i fsum1 = _mm256_loadu_si256((__m256i const *)(sum_buf1 + wd));
const __m256i fsum2 = _mm256_loadu_si256((__m256i const *)(sum_buf2 + wd));
const __m256i fsum3 = _mm256_loadu_si256((__m256i const *)(sum_buf3 + wd));
const __m256i line00 = _mm256_cvtepi16_epi32(line0);
const __m256i line11 = _mm256_cvtepi16_epi32(line1);
const __m256i line22 = _mm256_cvtepi16_epi32(line2);
const __m256i line33 = _mm256_cvtepi16_epi32(line3);
const __m256i sub0 = _mm256_add_epi32(fsum0, line00);
const __m256i sub1 = _mm256_add_epi32(fsum1, line11);
const __m256i sub2 = _mm256_add_epi32(fsum2, line22);
const __m256i sub3 = _mm256_add_epi32(fsum3, line33);
_mm256_storeu_si256((__m256i *)(sum_buf0 + wd), sub0);
_mm256_storeu_si256((__m256i *)(sum_buf1 + wd), sub1);
_mm256_storeu_si256((__m256i *)(sum_buf2 + wd), sub2);
_mm256_storeu_si256((__m256i *)(sum_buf3 + wd), sub3);
buffer_col += 8;
}
// Invoke C function to process remaining width.
const int remaining_width = tot_width - buffer_col;
if (remaining_width)
update_feature_sum_bufs_c(feature_sum_buffers, feature_line_buffers,
feature_length, buffer_row, 0, remaining_width,
buffer_col);
}
void fill_directional_feature_buffers_highbd_avx2(
int *feature_sum_buffers[], int16_t *feature_line_buffers[], int row,
int buffer_row, const uint16_t *dgd, int dgd_stride, int width,
int feature_lead, int feature_lag) {
const int feature_length = feature_lead + feature_lag + 1;
const int col_begin = -feature_lead;
const int col_end = width + feature_lag;
int buffer_col = 0;
// In the AVX2 functions below, the total width to be processed is calculated
// as col_end - col_begin. Hence, this assert is needed to ensure the width is
// always >= 0.
assert(col_begin <= col_end);
// Preparation of feature sum buffers by subtracting the feature line buffers.
prepare_feature_sum_bufs_avx2(feature_sum_buffers, feature_line_buffers,
feature_length, buffer_row, col_begin, col_end,
buffer_col);
// Compute the gradient across different directions.
calc_gradient_in_various_directions_avx2(
feature_line_buffers, row, buffer_row, dgd, dgd_stride, width, col_begin,
col_end, feature_length, buffer_col);
// Update the feature sum buffers with updated feature line buffers.
update_feature_sum_bufs_avx2(feature_sum_buffers, feature_line_buffers,
feature_length, buffer_row, col_begin, col_end,
buffer_col);
}
static const uint8_t shuffle_mask_low[32] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5,
5, 5, 6, 6, 6, 6, 7, 7, 7, 7 };
static const uint8_t shuffle_mask_high[32] = { 8, 8, 8, 8, 9, 9, 9, 9,
10, 10, 10, 10, 11, 11, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13,
14, 14, 14, 14, 15, 15, 15, 15 };
static AOM_INLINE void set_left_and_right_tskip_sum(int8_t *tskip_sum_buffer,
int col_begin, int width,
int col_end,
int8_t left_tskip,
int8_t right_tskip) {
int buffer_col = 0;
for (int col = col_begin; col < 0; ++col) {
tskip_sum_buffer[buffer_col] = left_tskip;
++buffer_col;
}
buffer_col += width;
for (int col = width; col < col_end; ++col) {
tskip_sum_buffer[buffer_col] = right_tskip;
++buffer_col;
}
}
void av1_fill_tskip_sum_buffer_avx2(int row, const uint8_t *tskip,
int tskip_stride, int8_t *tskip_sum_buffer,
int width, int height, int tskip_lead,
int tskip_lag, bool use_strict_bounds) {
if ((width != 64) && (width != 32)) {
av1_fill_tskip_sum_buffer_c(row, tskip, tskip_stride, tskip_sum_buffer,
width, height, tskip_lead, tskip_lag,
use_strict_bounds);
return;
}
// TODO(oguleryuz): tskip needs boundary extension.
assert(use_strict_bounds == true);
const int col_begin = -tskip_lead;
const int col_end = width + tskip_lag;
const int clamped_row = AOMMAX(AOMMIN(row, height - 1), 0);
const int tskip_id_base_add = (clamped_row >> MI_SIZE_LOG2) * tskip_stride;
const int tskip_length = tskip_lead + tskip_lag + 1;
int subtract_row = row - tskip_length;
int tskip_id_base_sub = 0;
if (subtract_row >= -tskip_lead) {
assert(subtract_row <= height - 1);
subtract_row = subtract_row >= 0 ? subtract_row : 0;
tskip_id_base_sub = (subtract_row >> MI_SIZE_LOG2) * tskip_stride;
}
if (width == 64) {
__m128i tx_skip_add =
_mm_loadu_si128((__m128i *)(tskip + tskip_id_base_add));
if (subtract_row >= -tskip_lead) {
const __m128i tx_skip_sub =
_mm_loadu_si128((__m128i *)(tskip + tskip_id_base_sub));
tx_skip_add = _mm_sub_epi8(tx_skip_add, tx_skip_sub);
}
// tx_skip_add
// a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
const __m256i tx_skip_add_temp = _mm256_broadcastsi128_si256(tx_skip_add);
// tx_skip_add_low
// a0 a0 a0 a0 a1 a1 a1 a1 a2 a2 a2 a2 a3 a3 a3 a3
// a4 a4 a4 a4 a5 a5 a5 a5 a6 a6 a6 a6 a7 a7 a7 a7
const __m256i tx_skip_add_low = _mm256_shuffle_epi8(
tx_skip_add_temp, _mm256_loadu_si256((__m256i *)shuffle_mask_low));
// tx_skip_add_high
// a8 a8 a8 a8 a9 a9 a9 a9 a10 a10 a10 a10 a11 a11 a11 a11
// a12 a12 a12 a12 a13 a13 a13 a13 a14 a14 a14 a14 a15 a15 a15 a15
const __m256i tx_skip_add_high = _mm256_shuffle_epi8(
tx_skip_add_temp, _mm256_loadu_si256((__m256i *)shuffle_mask_high));
const __m256i tskip_0 =
_mm256_loadu_si256((__m256i *)(tskip_sum_buffer - col_begin));
const __m256i tskip_1 =
_mm256_loadu_si256((__m256i *)(tskip_sum_buffer - col_begin + 32));
const __m256i tskip_sum_low = _mm256_add_epi8(tskip_0, tx_skip_add_low);
const __m256i tskip_sum_high = _mm256_add_epi8(tskip_1, tx_skip_add_high);
_mm256_storeu_si256((__m256i *)(tskip_sum_buffer - col_begin),
tskip_sum_low);
_mm256_storeu_si256((__m256i *)(tskip_sum_buffer - col_begin + 32),
tskip_sum_high);
// Extend data from [col_begin, 0) and [width, col_end).
const int8_t left_tskip = _mm256_extract_epi8(tskip_sum_low, 0);
if (col_begin == -1 && (col_end == (width + 4))) {
*((int8_t *)tskip_sum_buffer) = left_tskip;
int32_t col_end_value = _mm256_extract_epi32(tskip_sum_high, 7);
memcpy((tskip_sum_buffer - col_begin + width), &col_end_value,
sizeof(int32_t));
} else {
const int8_t right_tskip = _mm256_extract_epi8(tskip_sum_high, 31);
set_left_and_right_tskip_sum(tskip_sum_buffer, col_begin, width, col_end,
left_tskip, right_tskip);
}
} else if (width == 32) {
__m128i tx_skip_add =
_mm_loadl_epi64((__m128i *)(tskip + tskip_id_base_add));
if (subtract_row >= -tskip_lead) {
const __m128i tx_skip_sub =
_mm_loadl_epi64((__m128i *)(tskip + tskip_id_base_sub));
tx_skip_add = _mm_sub_epi8(tx_skip_add, tx_skip_sub);
}
const __m256i tx_skip_add_temp = _mm256_broadcastsi128_si256(tx_skip_add);
// tx_skip_add_low
// a0 a0 a0 a0 a1 a1 a1 a1 a2 a2 a2 a2 a3 a3 a3 a3
// a4 a4 a4 a4 a5 a5 a5 a5 a6 a6 a6 a6 a7 a7 a7 a7
const __m256i tx_skip_add_low = _mm256_shuffle_epi8(
tx_skip_add_temp, _mm256_loadu_si256((__m256i *)shuffle_mask_low));
const __m256i tskip_0 =
_mm256_loadu_si256((__m256i *)(tskip_sum_buffer - col_begin));
const __m256i tskip_sum_low = _mm256_add_epi8(tskip_0, tx_skip_add_low);
_mm256_storeu_si256((__m256i *)(tskip_sum_buffer - col_begin),
tskip_sum_low);
// Extend data from [col_begin, 0) and [width, col_end).
const int8_t left_tskip = _mm256_extract_epi8(tskip_sum_low, 0);
const int8_t right_tskip = _mm256_extract_epi8(tskip_sum_low, 31);
if (col_begin == -1 && (col_end == (width + 1))) {
*((int8_t *)tskip_sum_buffer) = left_tskip;
*((int8_t *)(tskip_sum_buffer - col_begin + width)) = right_tskip;
} else {
set_left_and_right_tskip_sum(tskip_sum_buffer, col_begin, width, col_end,
left_tskip, right_tskip);
}
}
}
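// Illustrative scalar sketch of the sliding-window update performed above for
// the interior columns [0, width): the tskip row entering the window is added
// and the row leaving it is subtracted, with each tskip entry covering
// (1 << MI_SIZE_LOG2) columns, which is why the SIMD path replicates every
// byte four times via shuffle_mask_low/high. Boundary handling for
// [col_begin, 0) and [width, col_end) is omitted; the function and parameter
// names are local to this sketch.
static AOM_INLINE void tskip_sliding_window_sketch(
    int8_t *tskip_sum_buffer, const uint8_t *tskip, int add_row_base,
    int sub_row_base, bool have_sub_row, int width, int col_begin) {
  for (int col = 0; col < width; ++col) {
    int8_t delta = (int8_t)tskip[add_row_base + (col >> MI_SIZE_LOG2)];
    if (have_sub_row) {
      // 8-bit subtraction mirrors the _mm_sub_epi8 used above.
      delta =
          (int8_t)(delta - (int8_t)tskip[sub_row_base + (col >> MI_SIZE_LOG2)]);
    }
    tskip_sum_buffer[col - col_begin] += delta;
  }
}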
#define LOAD_INPUT_DATA(cb, cl) \
/*A0 A1 A2 A3 A4 A5 A6 A7*/ \
src_A = _mm256_loadu_si256((__m256i *)(lb0 + cb)); \
/* B0 B1 B2 B3 B4 B5 B6 B7 */ \
src_B = _mm256_loadu_si256((__m256i *)(lb1 + cb)); \
/* C0 C1 C2 C3 C4 C5 C6 C7 */ \
src_C = _mm256_loadu_si256((__m256i *)(lb2 + cb)); \
/* D0 D1 D2 D3 D4 D5 D6 D7 */ \
src_D = _mm256_loadu_si256((__m256i *)(lb3 + cb)); \
/* a0 a1 a2 a3 a4 a5 a6 a7 */ \
src_a = _mm256_loadu_si256((__m256i *)(lb0 + cl)); \
/* b0 b1 b2 b3 b4 b5 b6 b7*/ \
src_b = _mm256_loadu_si256((__m256i *)(lb1 + cl)); \
/* c0 c1 c2 c3 c4 c5 c6 c7 */ \
src_c = _mm256_loadu_si256((__m256i *)(lb2 + cl)); \
/* d0 d1 d2 d3 d4 d5 d6 d7 */ \
src_d = _mm256_loadu_si256((__m256i *)(lb3 + cl));
static AOM_INLINE void process_feature_accum_wd8(__m256i src_A, __m256i src_B,
__m256i src_C, __m256i src_D,
__m256i src_a, __m256i src_b,
__m256i src_c, __m256i src_d,
__m256i *result) {
// ra0 ra1 ra2 ra3 ra4 ra5 ra6 ra7
const __m256i diff_Aa = _mm256_sub_epi32(src_A, src_a);
// rb0 rb1 rb2 rb3 rb4 rb5 rb6 rb7
const __m256i diff_Bb = _mm256_sub_epi32(src_B, src_b);
// rc0 rc1 rc2 rc3 rc4 rc5 rc6 rc7
const __m256i diff_Cc = _mm256_sub_epi32(src_C, src_c);
// rd0 rd1 rd2 rd3 rd4 rd5 rd6 rd7
const __m256i diff_Dd = _mm256_sub_epi32(src_D, src_d);
// ra0+ra1 ra2+ra3 rb0+rb1 rb2+rb3 ra4+ra5 ra6+ra7 rb4+rb5 rb6+rb7
const __m256i result0 = _mm256_hadd_epi32(diff_Aa, diff_Bb);
// rc0+rc1 rc2+rc3 rd0+rd1 rd2+rd3 rc4+rc5 rc6+rc7 rd4+rd5 rd6+rd7
const __m256i result1 = _mm256_hadd_epi32(diff_Cc, diff_Dd);
// ra0+ra1+ra2+ra3 rb0+rb1+rb2+rb3 rc0+rc1+rc2+rc3 rd0+rd1+rd2+rd3 --
// ra4+ra5+ra6+ra7 rb4+rb5+rb6+rb7 rc4+rc5+rc6+rc7 rd4+rd5+rd6+rd7
*result = _mm256_hadd_epi32(result0, result1);
}
static AOM_INLINE void unpack_results_and_store(
int dir_feature_accum[][PC_WIENER_FEATURE_ACC_SIZE], __m256i result0,
__m256i result1, int cur_col) {
const int cur_idx = cur_col / 4;
int *sb0 = dir_feature_accum[0];
int *sb1 = dir_feature_accum[1];
int *sb2 = dir_feature_accum[2];
int *sb3 = dir_feature_accum[3];
const __m128i accumu_r0 = _mm_set1_epi32(dir_feature_accum[0][cur_idx]);
const __m128i accumu_r1 = _mm_set1_epi32(dir_feature_accum[1][cur_idx]);
const __m128i accumu_r2 = _mm_set1_epi32(dir_feature_accum[2][cur_idx]);
const __m128i accumu_r3 = _mm_set1_epi32(dir_feature_accum[3][cur_idx]);
// ra0+ra1+ra2+ra3 rb0+rb1+rb2+rb3 rc0+rc1+rc2+rc3 rd0+rd1+rd2+rd3
const __m128i f1 = _mm256_castsi256_si128(result0);
// ra4+ra5+ra6+ra7 rb4+rb5+rb6+rb7 rc4+rc5+rc6+rc7 rd4+rd5+rd6+rd7
const __m128i fresult01 = _mm256_extractf128_si256(result0, 1);
// a0-a7 b0-b7 c0-c7 d0-d7
const __m128i f2 = _mm_add_epi32(f1, fresult01);
// ra8+ra9+ra10+ra11 rb8+rb9+rb10+rb11 rc8+rc9+rc10+rc11 rd8+rd9+rd10+rd11
const __m128i fresult10 = _mm256_castsi256_si128(result1);
// a0-a11 b0-b11 c0-c11 d0-d11
const __m128i f3 = _mm_add_epi32(f2, fresult10);
// ra12+ra13+ra14+ra15 rb12+rb13+rb14+rb15 rc12+rc13+rc14+rc15
// rd12+rd13+rd14+rd15
const __m128i fresult11 = _mm256_extractf128_si256(result1, 1);
// a0-a15 b0-b15 c0-c15 d0-d15
const __m128i f4 = _mm_add_epi32(f3, fresult11);
// ra0-ra3 ra0-ra7 rb0-rb3 rb0-rb7
const __m128i r00 = _mm_unpacklo_epi32(f1, f2);
// rc0-rc3 rc0-rc7 rd0-rd3 rd0-rd7
const __m128i r20 = _mm_unpackhi_epi32(f1, f2);
// ra0-ra11 ra0-ra15 rb0-rb11 rb0-rb15
const __m128i r01 = _mm_unpacklo_epi32(f3, f4);
// rc0-rc11 rc0-rc15 rd0-rd11 rd0-rd15
const __m128i r21 = _mm_unpackhi_epi32(f3, f4);
// ra0-ra3 ra0-ra7 ra0-ra11 ra0-ra15
__m128i r0 = _mm_unpacklo_epi64(r00, r01);
// rb0-rb3 rb0-rb7 rb0-rb11 rb0-rb15
__m128i r1 = _mm_unpackhi_epi64(r00, r01);
// rc0-rc3 rc0-rc7 rc0-rc11 rc0-rc15
__m128i r2 = _mm_unpacklo_epi64(r20, r21);
// rd0-rd3 rd0-rd7 rd0-rd11 rd0-rd15
__m128i r3 = _mm_unpackhi_epi64(r20, r21);
r0 = _mm_add_epi32(r0, accumu_r0);
r1 = _mm_add_epi32(r1, accumu_r1);
r2 = _mm_add_epi32(r2, accumu_r2);
r3 = _mm_add_epi32(r3, accumu_r3);
_mm_storeu_si128((__m128i *)(sb0 + cur_idx + 1), r0);
_mm_storeu_si128((__m128i *)(sb1 + cur_idx + 1), r1);
_mm_storeu_si128((__m128i *)(sb2 + cur_idx + 1), r2);
_mm_storeu_si128((__m128i *)(sb3 + cur_idx + 1), r3);
}
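// Illustrative scalar sketch of the accumulation realized by
// process_feature_accum_wd8() and unpack_results_and_store() for a single
// direction: per-column window differences (the new column of feature_sum_buf
// minus the one leaving the window) are reduced in groups of 4 columns and
// turned into a running accumulation seeded with the previous accumulator
// entry, so 16 columns fill four new accumulator entries per direction. The
// function and parameter names are local to this sketch.
static AOM_INLINE void feature_accum_wd16_sketch(const int *new_col_sums,
                                                 const int *old_col_sums,
                                                 int prev_accum, int out[4]) {
  int running = prev_accum;
  for (int group = 0; group < 4; ++group) {
    for (int i = 0; i < 4; ++i) {
      const int idx = group * 4 + i;
      running += new_col_sums[idx] - old_col_sums[idx];
    }
    out[group] = running;
  }
}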
static AOM_INLINE void process_feature_accum_wd16(
int dir_feature_accum[][PC_WIENER_FEATURE_ACC_SIZE], int *feature_sum_buf[],
int cur_col, int cb, int feature_length) {
const int cl = cb - feature_length;
const int *lb0 = feature_sum_buf[0];
const int *lb1 = feature_sum_buf[1];
const int *lb2 = feature_sum_buf[2];
const int *lb3 = feature_sum_buf[3];
__m256i src_A, src_B, src_C, src_D, src_a, src_b, src_c, src_d;
// Process the first 8 pixels.
LOAD_INPUT_DATA(cb, cl)
__m256i result0 = _mm256_set1_epi16(0);
process_feature_accum_wd8(src_A, src_B, src_C, src_D, src_a, src_b, src_c,
src_d, &result0);
// Process the next 8 pixels.
LOAD_INPUT_DATA(cb + 8, cl + 8)
__m256i result1 = _mm256_set1_epi16(0);
process_feature_accum_wd8(src_A, src_B, src_C, src_D, src_a, src_b, src_c,
src_d, &result1);
unpack_results_and_store(dir_feature_accum, result0, result1, cur_col);
}
static AOM_INLINE void process_feature_accum_wd15(
int dir_feature_accum[][PC_WIENER_FEATURE_ACC_SIZE], int *feature_sum_buf[],
int cur_col, int cb, int feature_length) {
const int cl = cb - feature_length;
const int *lb0 = feature_sum_buf[0];
const int *lb1 = feature_sum_buf[1];
const int *lb2 = feature_sum_buf[2];
const int *lb3 = feature_sum_buf[3];
__m256i src_A, src_B, src_C, src_D, src_a, src_b, src_c, src_d;
// Process the first 8 pixels.
LOAD_INPUT_DATA(cb, cl);
__m256i result0 = _mm256_set1_epi16(0);
process_feature_accum_wd8(src_A, src_B, src_C, src_D, src_a, src_b, src_c,
src_d, &result0);
// Process the next 8 pixels.
LOAD_INPUT_DATA(cb + 8, cl + 8);
const __m256i zero = _mm256_setzero_si256();
src_A = _mm256_blend_epi32(src_A, zero, 0x80);
src_B = _mm256_blend_epi32(src_B, zero, 0x80);
src_C = _mm256_blend_epi32(src_C, zero, 0x80);
src_D = _mm256_blend_epi32(src_D, zero, 0x80);
src_a = _mm256_blend_epi32(src_a, zero, 0x80);
src_b = _mm256_blend_epi32(src_b, zero, 0x80);
src_c = _mm256_blend_epi32(src_c, zero, 0x80);
src_d = _mm256_blend_epi32(src_d, zero, 0x80);
__m256i result1 = zero;
process_feature_accum_wd8(src_A, src_B, src_C, src_D, src_a, src_b, src_c,
src_d, &result1);
unpack_results_and_store(dir_feature_accum, result0, result1, cur_col);
}
void av1_fill_directional_feature_accumulators_avx2(
int dir_feature_accum[NUM_PC_WIENER_FEATURES][PC_WIENER_FEATURE_ACC_SIZE],
int *feature_sum_buf[NUM_PC_WIENER_FEATURES], int width, int col_offset,
int feature_lead, int feature_lag) {
// SIMD is implemented only for a pc_wiener block size of 4x4 and a restoration
// unit size of 64x64 (luma) or 32x32 (chroma).
if ((PC_WIENER_BLOCK_SIZE != 4) || ((width != 64) && (width != 32))) {
// TODO(oguleryuz): Reduce the cases where we punt to c-code.
av1_fill_directional_feature_accumulators_c(
dir_feature_accum, feature_sum_buf, width, col_offset, feature_lead,
feature_lag);
return;
}
// Process column 0 here.
int cur_col = 0;
const int feature_length = feature_lead + feature_lag + 1;
int cb = cur_col + col_offset + feature_lead;
for (int k = 0; k < NUM_PC_WIENER_FEATURES; k++) {
dir_feature_accum[k][0] += feature_sum_buf[k][cb];
}
++cb;
// Process the remaining width here.
if (width == 64) {
// Process next 16 (i.e., 1 to 16) pixels here.
cur_col = 1;
process_feature_accum_wd16(dir_feature_accum, feature_sum_buf, cur_col, cb,
feature_length);
// Process next 16 (i.e., 17 to 32) pixels here.
cur_col += 16;
cb += 16;
process_feature_accum_wd16(dir_feature_accum, feature_sum_buf, cur_col, cb,
feature_length);
// Process next 16 (i.e., 33 to 48) pixels here.
cur_col += 16;
cb += 16;
process_feature_accum_wd16(dir_feature_accum, feature_sum_buf, cur_col, cb,
feature_length);
// Process remaining 15 (i.e., 49 to 63) pixels here.
cur_col += 16;
cb += 16;
process_feature_accum_wd15(dir_feature_accum, feature_sum_buf, cur_col, cb,
feature_length);
} else if (width == 32) {
// Process next 16 pixels here.
cur_col = 1;
process_feature_accum_wd16(dir_feature_accum, feature_sum_buf, cur_col, cb,
feature_length);
// Process remaining 15 pixels here.
cur_col += 16;
cb += 16;
process_feature_accum_wd15(dir_feature_accum, feature_sum_buf, cur_col, cb,
feature_length);
} else {
// Any other width is handled by the C function above, so this assert should
// never be hit.
assert(0);
}
}
// Mask used to zero out the last 8-bit element of a 256-bit register.
static const int8_t blend_mask[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, -1 };
static AOM_INLINE void process_accumulation_wd32(const __m256i src_A,
const __m256i src_a,
const __m128i accum_reg,
int16_t *tskip_out) {
// d1 - - - - d32
const __m256i diff_Aa = _mm256_sub_epi8(src_A, src_a);
const __m256i diff0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(diff_Aa));
const __m256i diff1 =
_mm256_cvtepi8_epi16(_mm256_extractf128_si256(diff_Aa, 1));
// d1+d2 d3+d4 d5+d6 d7+d8 d17+d18 d19+d20 d21+d22 d23+d24 |
// d9+d10 d11+d12 d13+d14 d15+d16 d25+d26 d27+d28 d29+d30 d31+d32
const __m256i r1 = _mm256_hadd_epi16(diff0, diff1);
// d1+d2 d3+d4 d5+d6 d7+d8 d9+d10 d11+d12 d13+d14 d15+d16 |
// d17+d18 d19+d20 d21+d22 d23+d24 d25+d26 d27+d28 d29+d30 d31+d32
__m256i r2 = _mm256_permute4x64_epi64(r1, 0xd8);
// sum of: d1d4 d5d8 d9d12 d13d16 d17d20 d21d24 d25d28 d29d32
r2 = _mm256_hadd_epi16(
r2, _mm256_castsi128_si256(_mm256_extractf128_si256(r2, 1)));
// d1d4 d5d8 d9d12 d13d16 d17d20 d21d24 d25d28 d29d32 | x
const __m128i low128 = _mm256_castsi256_si128(r2);
// 0 d1d4 d5d8 d9d12 d13d16 d17d20 d21d24 d25d28
__m128i tmp_shift = _mm_bslli_si128(low128, 2);
// d1d4 d1d8 d5d12 d9d16 d13d20 d17d24 d21d28 d25d32
const __m128i result0 = _mm_add_epi16(low128, tmp_shift);
// 0 0 d1d4 d1d8 d5d12 d9d16 d13d20 d17d24
tmp_shift = _mm_bslli_si128(result0, 4);
// d1d4 d1d8 d1d12 d1d16 d5d20 d9d24 d13d28 d17d32
const __m128i result1 = _mm_add_epi16(result0, tmp_shift);
// 0 0 0 0 d1d4 d1d8 d1d12 d1d16
tmp_shift = _mm_bslli_si128(result1, 8);
// d1d4 d1d8 d1d12 d1d16 d1d20 d1d24 d1d28 d1d32
const __m128i result2 = _mm_add_epi16(result1, tmp_shift);
// Add the 0th result to all the values
const __m128i result = _mm_add_epi16(accum_reg, result2);
_mm_storeu_si128((__m128i *)(tskip_out), result);
}
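// Illustrative scalar sketch of what process_accumulation_wd32() computes: 32
// per-column tskip differences (new window column minus old, mirroring the
// 8-bit _mm256_sub_epi8 above) are reduced to eight sums over groups of 4
// columns, an inclusive prefix sum is taken across those group sums, and the
// previous accumulator value is added to every entry. The SIMD code realizes
// the prefix sum with the shift-and-add ladder above. The function and
// parameter names are local to this sketch.
static AOM_INLINE void tskip_accum_wd32_sketch(const int8_t *add_buf,
                                               const int8_t *sub_buf,
                                               int16_t prev_accum,
                                               int16_t out[8]) {
  int16_t running = prev_accum;
  for (int group = 0; group < 8; ++group) {
    int16_t group_sum = 0;
    for (int i = 0; i < 4; ++i) {
      const int idx = group * 4 + i;
      group_sum = (int16_t)(group_sum + (int8_t)(add_buf[idx] - sub_buf[idx]));
    }
    running = (int16_t)(running + group_sum);
    out[group] = running;
  }
}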
void av1_fill_tskip_feature_accumulator_avx2(
int16_t tskip_feature_accum[PC_WIENER_FEATURE_ACC_SIZE],
int8_t *tskip_sum_buf, int width, int col_offset, int tskip_lead,
int tskip_lag) {
// SIMD is implemented only for a pc_wiener block size of 4x4 and a restoration
// unit size of 64x64 (luma) or 32x32 (chroma).
if ((PC_WIENER_BLOCK_SIZE != 4) || ((width != 64) && (width != 32))) {
// TODO(oguleryuz): Reduce the cases where we punt to c-code.
av1_fill_tskip_feature_accumulator_c(tskip_feature_accum, tskip_sum_buf,
width, col_offset, tskip_lead,
tskip_lag);
return;
}
// Process pixel 0 here.
int col_base = col_offset + tskip_lead;
const int tskip_length = tskip_lead + tskip_lag + 1;
assert(col_base >= 0);
tskip_feature_accum[0] += tskip_sum_buf[col_base];
col_base++;
int cl = col_base - tskip_length;
// Process the remaining width here.
if (width == 64) {
// Processing of the next 32 pixels.
const __m256i src_A =
_mm256_loadu_si256((__m256i const *)(tskip_sum_buf + col_base));
const __m256i src_a =
_mm256_loadu_si256((__m256i const *)(tskip_sum_buf + cl));
__m128i accum_reg = _mm_set1_epi16(tskip_feature_accum[0]);
process_accumulation_wd32(src_A, src_a, accum_reg, &tskip_feature_accum[1]);
// Process the remaining 31 pixels here. Even though only 31 pixels are
// processed, 32 are loaded and the last 8-bit element of the 256-bit register
// is zeroed out. The load will not read out of bounds since tskip_sum_buffer
// holds enough elements.
col_base += 32;
cl = col_base - tskip_length;
__m256i src_B =
_mm256_loadu_si256((__m256i const *)(tskip_sum_buf + col_base));
__m256i src_b = _mm256_loadu_si256((__m256i const *)(tskip_sum_buf + cl));
const __m256i blend_reg = _mm256_loadu_si256((__m256i *)blend_mask);
const __m256i zero = _mm256_setzero_si256();
src_B = _mm256_blendv_epi8(src_B, zero, blend_reg);
src_b = _mm256_blendv_epi8(src_b, zero, blend_reg);
accum_reg = _mm_set1_epi16(tskip_feature_accum[8]);
process_accumulation_wd32(src_B, src_b, accum_reg, &tskip_feature_accum[9]);
} else if (width == 32) {
// Process the remaining 31 pixels here. Even though only 31 pixels are
// processed, 32 are loaded and the last 8-bit element of the 256-bit register
// is zeroed out. The load will not read out of bounds since tskip_sum_buffer
// holds enough elements.
__m256i src_A =
_mm256_loadu_si256((__m256i const *)(tskip_sum_buf + col_base));
__m256i src_a = _mm256_loadu_si256((__m256i const *)(tskip_sum_buf + cl));
const __m256i blend_reg = _mm256_loadu_si256((__m256i *)blend_mask);
const __m256i zero = _mm256_setzero_si256();
__m128i accum_reg = _mm_set1_epi16(tskip_feature_accum[0]);
src_A = _mm256_blendv_epi8(src_A, zero, blend_reg);
src_a = _mm256_blendv_epi8(src_a, zero, blend_reg);
process_accumulation_wd32(src_A, src_a, accum_reg, &tskip_feature_accum[1]);
} else {
// Any other width is handled by the C function above, so this assert should
// never be hit.
assert(0);
}
}