Add AVX2 for mhccp_implicit_fetch_neighbor
This patch includes the following changes:
1. AVX2 implementations of mhccp_implicit_fetch_neighbor for luma and chroma.
2. Unit tests that check bit-exact matching between the mhccp_implicit_fetch_neighbor C functions and the AVX2 intrinsic functions.
The speedups of the AVX2 implementations relative to the C code are listed below:
| BW | Scaling Y | Scaling UV |
|--------|-----------|------------|
| 4 | 2.34x | 3.28x |
| 8 | 3.96x | 5.01x |
| 16 | 7.08x | 5.99x |
| 32 | 9.08x | 11.43x |
| 64 | 10.19x | NA |
This is a bit-exact change.
This change provides a 0.5% decode-time reduction for key frames on the A2 test set, measured on commit 249b80e8.
diff --git a/av2/common/av2_rtcd_defs.pl b/av2/common/av2_rtcd_defs.pl
index c1ead36..08f4115 100644
--- a/av2/common/av2_rtcd_defs.pl
+++ b/av2/common/av2_rtcd_defs.pl
@@ -452,6 +452,12 @@
add_proto qw/av2_cfl_subtract_average_fn av2_cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
specialize qw/av2_cfl_get_subtract_average_fn sse2 avx2 neon vsx/;
+add_proto qw/void av2_mhccp_implicit_fetch_neighbor_chroma/, "const uint16_t *dst, int input_stride, TX_SIZE tx_size, int above_lines, int left_lines, int is_top_sb_boundary, int ref_width, int ref_height, uint16_t *output_q3";
+specialize qw/av2_mhccp_implicit_fetch_neighbor_chroma avx2/;
+
+add_proto qw/void av2_mhccp_implicit_fetch_neighbor_luma_420/, "const uint16_t *input, int input_stride, int above_lines, int left_lines, int is_top_sb_boundary, int ref_width, int ref_height, int sub_y, uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3, int output_stride";
+specialize qw/av2_mhccp_implicit_fetch_neighbor_luma_420 avx2/;
+
add_proto qw/av2_cfl_subsample_hbd_fn av2_cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
specialize qw/av2_cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
diff --git a/av2/common/reconintra.c b/av2/common/reconintra.c
index e729709..be6528d 100644
--- a/av2/common/reconintra.c
+++ b/av2/common/reconintra.c
@@ -1826,6 +1826,82 @@
}
}
+// Fetch neighboring luma samples for multi hypothesis cross component
+// prediction for 4:2:0 case.
+void av2_mhccp_implicit_fetch_neighbor_luma_420_c(
+ const uint16_t *input, int input_stride, int above_lines, int left_lines,
+ int is_top_sb_boundary, int ref_width, int ref_height, int sub_y,
+ uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3,
+ int output_stride) {
+ for (int h = 0; h < ref_height; h += 2) {  // two input rows per output row
+ for (int w = 0; w < ref_width; w += 2) {  // two input columns per output
+ const int bot = w + input_stride;  // same column, one input row below
+ if ((h >= above_lines && w >= left_lines + width) ||
+ (h >= above_lines + height && w >= left_lines))
+ continue;  // position is not fetched; output_q3 is left untouched
+ // For blocks at the superblock top boundary, only one line above is
+ // available, so the row offsets for the above region must be adjusted.
+ // Proposal E229 (for the 4:2:0 case) proposes to use only 4 lines and
+ // 2 padding lines for the luma reference region, and 2 lines and
+ // 1 padding line for the chroma reference region. Therefore, for
+ // these 2 padding lines above and to the left, we need to offset the
+ // reference region for the top and left boundaries.
+ // ref_h_t_off is the position offset value of the top pixel in
+ // cfl_ds_filter_index == 2 for both padding and superblock top boundary.
+ int ref_h_t_off = 0;
+ // ref_h_c_off is the position offset value of the pixel in the same
+ // horizontal line of the center pixel for both padding and superblock
+ // top boundary.
+ int ref_h_c_off = 0;
+ // ref_h_b_off is the position offset value of the bottom pixel in
+ // downsample filtering for both padding and superblock top boundary.
+ int ref_h_b_off = 0;
+ // ref_w_off is the position offset value for padding only in the left
+ // and right directions.
+ int ref_w_off = 0;  // never modified in this function, so always 0 here
+ // Preparing the vertical position offset values for superblock top
+ // boundary and padding.
+ if (above_lines == ((LINE_NUM + 1) << sub_y)) {
+ if (is_top_sb_boundary && (h < above_lines)) {
+ // For the top boundary of the superblock, we need to offset the
+ // reference region.
+ ref_h_t_off = h != 0 ? ((LINE_NUM + 1) << sub_y) - h
+ : ((LINE_NUM + 1) << sub_y) - (h + 1);
+ ref_h_c_off = ((LINE_NUM + 1) << sub_y) - (h + 1);
+ ref_h_b_off = ((LINE_NUM + 1) << sub_y) - (h + 2);
+ }
+ }
+ if (cfl_ds_filter_index == 1) {  // [1 2 1] on center row + [1 2 1] below
+ output_q3[w >> 1] =
+ input[AVMMAX(0, w - 1) + ref_w_off + ref_h_c_off * input_stride] +  // left tap, clamped at w == 0
+ 2 * input[w + ref_w_off + ref_h_c_off * input_stride] +
+ input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
+ input[bot + AVMMAX(-1, -w) + ref_w_off +  // bot - 1, or bot when w == 0
+ ref_h_b_off * input_stride] +
+ 2 * input[bot + ref_w_off + ref_h_b_off * input_stride] +
+ input[bot + 1 + ref_w_off + ref_h_b_off * input_stride];
+ } else if (cfl_ds_filter_index == 2) {  // top + [1 4 1] on center + bottom
+ const int top = h != 0 ? w - input_stride : w;  // row above, or same row when h == 0
+ output_q3[w >> 1] =
+ input[AVMMAX(0, w - 1) + ref_w_off + ref_h_c_off * input_stride] +
+ 4 * input[w + ref_w_off + ref_h_c_off * input_stride] +
+ input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
+ input[top + ref_w_off + ref_h_t_off * input_stride] +
+ input[bot + ref_w_off + ref_h_b_off * input_stride];
+ } else {  // any other index: twice the sum of the 2x2 block
+ output_q3[w >> 1] =
+ (input[w + ref_w_off + ref_h_c_off * input_stride] +
+ input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
+ input[bot + ref_w_off + ref_h_b_off * input_stride] +
+ input[bot + 1 + ref_w_off + ref_h_b_off * input_stride])
+ << 1;
+ }
+ }
+ output_q3 += output_stride;
+ input += (input_stride << 1);  // advance two input rows
+ }
+}
+
void mhccp_implicit_fetch_neighbor_luma(const AV2_COMMON *cm,
MACROBLOCKD *const xd, int row, int col,
TX_SIZE tx_size, int *above_lines,
@@ -1950,76 +2026,10 @@
input = input - (*above_lines) * input_stride - *left_lines;
if ((*above_lines) || (*left_lines)) {
if (sub_x && sub_y) {
- for (int h = 0; h < (*ref_height); h += 2) {
- for (int w = 0; w < (*ref_width); w += 2) {
- const int bot = w + input_stride;
- if ((h >= *above_lines && w >= *left_lines + width) ||
- (h >= *above_lines + height && w >= *left_lines))
- continue;
- // For blocks at the superblock top boundary, we only have one line
- // above available, therefore we need to offset values for above
- // region Proposal E229 (for 4:2:0 case) propose to use only 4 lines
- // and 2 padding lines for the luma reference region, and 2 lines and
- // 1 padding line for the chroma reference region. Therefore, for
- // these 2 padding lines above and to the left, we need to offset the
- // reference region for the top and left boundaries ref_h_t_off is the
- // position offset value of the top pixel in cfl_ds_filter_index == 2
- // for both padding and superblock top boundary
- int ref_h_t_off = 0;
- // ref_h_c_off is the position offset value of the pixel in the same
- // horizontal line of the center pixel for both padding and superblock
- // top boundary
- int ref_h_c_off = 0;
- // ref_h_b_off is the position offset value of the bottom pixel in
- // downsample filtering for both padding and superblock top boundary
- int ref_h_b_off = 0;
- // ref_w_off is the position offset value for padding only in the left
- // and right directions
- int ref_w_off = 0;
- // Preparing the vertical position offset values for superblock top
- // boundary and padding
- if (*above_lines == ((LINE_NUM + 1) << sub_y)) {
- if (is_top_sb_boundary && (h < *above_lines)) {
- // For the top boundary of the superblock, we need to offset the
- // reference region
- ref_h_t_off = h != 0 ? ((LINE_NUM + 1) << sub_y) - h
- : ((LINE_NUM + 1) << sub_y) - (h + 1);
- ref_h_c_off = ((LINE_NUM + 1) << sub_y) - (h + 1);
- ref_h_b_off = ((LINE_NUM + 1) << sub_y) - (h + 2);
- }
- }
- if (cm->seq_params.cfl_ds_filter_index == 1) {
- output_q3[w >> 1] =
- input[AVMMAX(0, w - 1) + ref_w_off +
- ref_h_c_off * input_stride] +
- 2 * input[w + ref_w_off + ref_h_c_off * input_stride] +
- input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
- input[bot + AVMMAX(-1, -w) + ref_w_off +
- ref_h_b_off * input_stride] +
- 2 * input[bot + ref_w_off + ref_h_b_off * input_stride] +
- input[bot + 1 + ref_w_off + ref_h_b_off * input_stride];
- } else if (cm->seq_params.cfl_ds_filter_index == 2) {
- const int top = h != 0 ? w - input_stride : w;
- output_q3[w >> 1] =
- input[AVMMAX(0, w - 1) + ref_w_off +
- ref_h_c_off * input_stride] +
- 4 * input[w + ref_w_off + ref_h_c_off * input_stride] +
- input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
- input[top + ref_w_off + ref_h_t_off * input_stride] +
- input[bot + ref_w_off + ref_h_b_off * input_stride];
- } else {
- output_q3[w >> 1] =
- (input[w + ref_w_off + ref_h_c_off * input_stride] +
- input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
- input[bot + ref_w_off + ref_h_b_off * input_stride] +
- input[bot + 1 + ref_w_off + ref_h_b_off * input_stride])
- << 1;
- }
- }
- output_q3 += output_stride;
- input += (input_stride << 1);
- }
-
+ av2_mhccp_implicit_fetch_neighbor_luma_420(
+ input, input_stride, *above_lines, *left_lines, is_top_sb_boundary,
+ *ref_width, *ref_height, sub_y, cm->seq_params.cfl_ds_filter_index,
+ width, height, output_q3, output_stride);
} else if (sub_x) {
for (int h = 0; h < (*ref_height); h++) {
for (int i = 0; i < (*ref_width); i += 2) {
@@ -2096,22 +2106,16 @@
}
}
-void mhccp_implicit_fetch_neighbor_chroma(MACROBLOCKD *const xd, int plane,
- int row, int col, TX_SIZE tx_size,
- int above_lines, int left_lines,
- int is_top_sb_boundary, int ref_width,
- int ref_height) {
- CFL_CTX *const cfl = &xd->cfl;
- struct macroblockd_plane *const pd = &xd->plane[plane];
- int input_stride = pd->dst.stride;
- uint16_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
-
+// Fetch neighboring chroma samples for multi hypothesis cross component
+// prediction.
+void av2_mhccp_implicit_fetch_neighbor_chroma_c(
+ const uint16_t *input, int input_stride, TX_SIZE tx_size, int above_lines,
+ int left_lines, int is_top_sb_boundary, int ref_width, int ref_height,
+ uint16_t *output_q3) {
const int width = tx_size_wide[tx_size];
const int height = tx_size_high[tx_size];
- uint16_t *output_q3 = cfl->mhccp_ref_buf_q3[plane];
int output_stride = CFL_BUF_LINE * 2;
- uint16_t *input = dst - above_lines * input_stride - left_lines;
if (above_lines || left_lines) {
for (int h = 0; h < ref_height; ++h) {
for (int w = 0; w < ref_width; ++w) {
@@ -2238,9 +2242,12 @@
left_lines >>= sub_x;
ref_width >>= sub_x;
ref_height >>= sub_y;
- mhccp_implicit_fetch_neighbor_chroma(
- xd, plane, blk_row, blk_col, tx_size, above_lines, left_lines,
- is_top_sb_boundary, ref_width, ref_height);
+ uint16_t *output_q3 = cfl->mhccp_ref_buf_q3[plane];
+ uint16_t *input = dst - above_lines * dst_stride - left_lines;
+
+ av2_mhccp_implicit_fetch_neighbor_chroma(
+ input, dst_stride, tx_size, above_lines, left_lines,
+ is_top_sb_boundary, ref_width, ref_height, output_q3);
av2_mhccp_derive_multi_param_hv(xd, plane, above_lines, left_lines,
ref_width, ref_height, mbmi->mh_dir,
is_top_sb_boundary);
diff --git a/av2/common/reconintra.h b/av2/common/reconintra.h
index 1205efe..4c03af1 100644
--- a/av2/common/reconintra.h
+++ b/av2/common/reconintra.h
@@ -263,13 +263,6 @@
TX_SIZE tx_size, int *above_lines,
int *left_lines, int is_top_sb_boundary,
int *ref_width, int *ref_height);
-// fetch neighboring chroma samples for multi hypothesis cross component
-// prediction
-void mhccp_implicit_fetch_neighbor_chroma(MACROBLOCKD *const xd, int plane,
- int row, int col, TX_SIZE tx_size,
- int above_lines, int left_lines,
- int is_top_sb_boundary, int ref_width,
- int ref_height);
static AVM_INLINE void set_have_top_and_left(int *have_top, int *have_left,
const MACROBLOCKD *xd, int row_off,
diff --git a/av2/common/x86/cfl_avx2.c b/av2/common/x86/cfl_avx2.c
index 5835dda..b97fea5 100644
--- a/av2/common/x86/cfl_avx2.c
+++ b/av2/common/x86/cfl_avx2.c
@@ -1823,3 +1823,211 @@
1 << MHCCP_DECIM_BITS;
}
}
+
+// AVX2 implementation for av2_mhccp_implicit_fetch_neighbor_chroma_c
+void av2_mhccp_implicit_fetch_neighbor_chroma_avx2(
+ const uint16_t *input, int input_stride, TX_SIZE tx_size, int above_lines,
+ int left_lines, int is_top_sb_boundary, int ref_width, int ref_height,
+ uint16_t *output_q3) {
+ (void)is_top_sb_boundary;  // not used by this implementation
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+
+ int output_stride = CFL_BUF_LINE * 2;
+ if (above_lines || left_lines) {
+ for (int h = 0; h < ref_height; ++h) {
+ assert(LINE_NUM == 1);
+ int ref_h_offset = (above_lines == (LINE_NUM + 1) && h == 0) ? 1 : 0;  // row 0 reads from row 1 in this case
+ int h_offset_stride = ref_h_offset * input_stride;
+ int w = 0;
+ for (; w < ref_width; w += 16) {  // 16 uint16_t samples per 256-bit vector
+ if ((h >= above_lines && w >= left_lines + width) ||
+ (h >= above_lines + height && w >= left_lines))
+ continue;  // tested once per 16-sample chunk, at its first column
+ __m256i l_vec =
+ _mm256_loadu_si256((const __m256i *)(input + h_offset_stride + w));
+ if (left_lines == (LINE_NUM + 1) && (w == 0)) {
+ __m128i lo = _mm256_castsi256_si128(l_vec);
+ lo = _mm_shufflelo_epi16(lo, _MM_SHUFFLE(3, 2, 1, 1));  // sample 0 <- sample 1
+ l_vec = _mm256_inserti128_si256(l_vec, lo, 0);
+ }
+ _mm256_storeu_si256((__m256i *)(output_q3 + w), l_vec);  // always a full 16-sample store
+ }
+ output_q3 += output_stride;
+ input += input_stride;
+ }
+ }
+}
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_index[32]) = { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4,
+ 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9,
+ 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
+DECLARE_ALIGNED(32, static const uint16_t,
+ blend_index[16]) = { 0xFFFF, 0, 0, 0, 0, 0, 0, 0,
+ 0xFFFF, 0, 0, 0, 0, 0, 0, 0 };
+
+// AVX2 implementation for av2_mhccp_implicit_fetch_neighbor_luma_420_c
+void av2_mhccp_implicit_fetch_neighbor_luma_420_avx2(
+ const uint16_t *input, int input_stride, int above_lines, int left_lines,
+ int is_top_sb_boundary, int ref_width, int ref_height, int sub_y,
+ uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3,
+ int output_stride) {
+ for (int h = 0; h < ref_height; h += 2) {  // two input rows per output row
+ int ref_h_t_off = 0;
+ int ref_h_c_off = 0;
+ int ref_h_b_off = 0;
+ int ref_w_off = 0;  // never modified in this function, so always 0 here
+ if (above_lines == ((LINE_NUM + 1) << sub_y)) {
+ if (is_top_sb_boundary && (h < above_lines)) {
+ // For the top boundary of the superblock, we need to offset the
+ // reference region
+ ref_h_t_off = h != 0 ? ((LINE_NUM + 1) << sub_y) - h
+ : ((LINE_NUM + 1) << sub_y) - (h + 1);
+ ref_h_c_off = ((LINE_NUM + 1) << sub_y) - (h + 1);
+ ref_h_b_off = ((LINE_NUM + 1) << sub_y) - (h + 2);
+ }
+ }
+ int h_c_offset_stride = ref_h_c_off * input_stride;
+ int h_b_offset_stride = ref_h_b_off * input_stride + input_stride;  // one input row below, plus boundary offset
+ for (int w = 0; w < ref_width; w += 32) {  // 32 input samples -> 16 outputs
+ if ((h >= above_lines && w >= left_lines + width) ||
+ (h >= above_lines + height && w >= left_lines))
+ continue;  // tested once per 32-sample chunk, at its first column
+
+ // h_c0, h_c1, ... h_c15
+ __m256i h_c0 =
+ _mm256_loadu_si256((const __m256i *)(input + h_c_offset_stride + w));
+
+ // h_c16, h_c17, ... h_c31
+ __m256i h_c16 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_c_offset_stride + w + 16));
+
+ // h_b0, h_b1, ..... h_b15
+ __m256i h_b0 =
+ _mm256_loadu_si256((const __m256i *)(input + h_b_offset_stride + w));
+
+ // h_b16, h_b17, ..... h_b31
+ __m256i h_b16 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_b_offset_stride + w + 16));
+
+ if (cfl_ds_filter_index == 1 || cfl_ds_filter_index == 2) {
+ __m256i blend_mask = _mm256_load_si256((__m256i *)blend_index);  // selects 16-bit lanes 0 and 8
+ __m256i h_cm1;
+ if (w > 0) {  // left neighbor exists: load starting one sample earlier
+ h_cm1 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_c_offset_stride + ref_w_off + w - 1));
+ } else {  // first column: build the shifted vector with sample 0 repeated
+ // h_c7, h_c0, h_c1, h_c2, h_c3, h_c4, h_c5, h_c6, h_c15, h_c8, h_c9,
+ // h_c10, h_c11, h_c12, h_c13, h_c14
+ __m256i w_reorder1 = _mm256_alignr_epi8(h_c0, h_c0, 14);
+ // h_c0, h_c1, h_c2, h_c3, h_c4, h_c5, h_c6, h_c7, h_c7, h_c0, h_c1,
+ // h_c2, h_c3, h_c4, h_c5, h_c6
+ __m256i w_reorder2 =
+ _mm256_permute2x128_si256(h_c0, w_reorder1, 0x20);
+ // h_c0, h_c0, h_c1, h_c2, h_c3, h_c4, h_c5, h_c6, h_c7, h_c8, h_c9,
+ // h_c10, h_c11, h_c12, h_c13, h_c14
+ h_cm1 = _mm256_blendv_epi8(w_reorder1, w_reorder2, blend_mask);
+ }
+
+ // h_c15, h_c16, ... h_c30
+ __m256i h_c15 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_c_offset_stride + w + 15));
+
+ if (cfl_ds_filter_index == 1) {
+ __m256i h_bm1;
+ if (w > 0) {
+ h_bm1 =
+ _mm256_loadu_si256((const __m256i *)(input + h_b_offset_stride +
+ ref_w_off + w - 1));
+ } else {
+ // h_b7, h_b0, h_b1, h_b2, h_b3, h_b4, h_b5, h_b6, h_b15, h_b8,
+ // h_b9, h_b10, h_b11, h_b12, h_b13, h_b14
+ __m256i b_reorder1 = _mm256_alignr_epi8(h_b0, h_b0, 14);
+ // h_b0, h_b1, h_b2, h_b3, h_b4, h_b5, h_b6, h_b7, h_b7, h_b0, h_b1,
+ // h_b2, h_b3, h_b4, h_b5, h_b6
+ __m256i b_reorder2 =
+ _mm256_permute2x128_si256(h_b0, b_reorder1, 0x20);
+
+ // h_b0, h_b0, h_b1, h_b2, h_b3, h_b4, h_b5, h_b6, h_b7, h_b8, h_b9,
+ // h_b10, h_b11, h_b12, h_b13, h_b14
+ h_bm1 = _mm256_blendv_epi8(b_reorder1, b_reorder2, blend_mask);
+ }
+
+ // h_b15, h_b16, ..... h_b30
+ __m256i h_b15 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_b_offset_stride + w + 15));
+
+ __m256i h_cm1_c0 = _mm256_add_epi16(h_cm1, h_c0);  // x[i - 1] + x[i] per lane
+ __m256i h_bm1_b0 = _mm256_add_epi16(h_bm1, h_b0);
+ __m256i h_c15_c16 = _mm256_add_epi16(h_c15, h_c16);
+ __m256i h_b15_b16 = _mm256_add_epi16(h_b15, h_b16);
+
+ __m256i h_cm1_c0_bm1_b0 = _mm256_add_epi16(h_cm1_c0, h_bm1_b0);
+ __m256i h_c15_c16_b15_b16 = _mm256_add_epi16(h_c15_c16, h_b15_b16);
+
+ __m256i res = _mm256_hadd_epi16(h_cm1_c0_bm1_b0, h_c15_c16_b15_b16);  // adjacent pairs sum to the [1 2 1] taps of both rows
+
+ res = _mm256_permute4x64_epi64(res, 0xD8);  // undo the per-128-bit-lane interleave of hadd
+
+ _mm256_storeu_si256((__m256i *)(output_q3 + (w >> 1)), res);
+ } else if (cfl_ds_filter_index == 2) {
+ const int top = h != 0 ? (-input_stride) : 0;  // row above, or same row when h == 0
+ int h_t_offset_stride = top + ref_h_t_off * input_stride;
+
+ __m256i h_t0 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_t_offset_stride + w));
+ __m256i h_t16 = _mm256_loadu_si256(
+ (const __m256i *)(input + h_t_offset_stride + w + 16));
+
+ const __m256i shuffle_mask =
+ _mm256_load_si256((__m256i *)shuffle_index);
+ // h_c0, h_c2, h_c4, ....
+ __m256i h_c0_even = _mm256_shuffle_epi8(h_c0, shuffle_mask);
+ // h_b0, h_b2, h_b4, ....
+ __m256i h_b0_even = _mm256_shuffle_epi8(h_b0, shuffle_mask);
+ __m256i b0_c0_even = _mm256_unpacklo_epi16(h_b0_even, h_c0_even);  // b0, c0, b2, c2, ...
+ // h_t0, h_t2, h_t4, ....
+ __m256i h_t0_even = _mm256_shuffle_epi8(h_t0, shuffle_mask);
+ __m256i t0_c0_even = _mm256_unpacklo_epi16(h_t0_even, h_c0_even);  // t0, c0, t2, c2, ...
+
+ // h_c16, h_c18, h_c20, ....
+ __m256i h_c16_even = _mm256_shuffle_epi8(h_c16, shuffle_mask);
+ // h_b16, h_b18, h_b20, ....
+ __m256i h_b16_even = _mm256_shuffle_epi8(h_b16, shuffle_mask);
+ __m256i b16_c16_even = _mm256_unpacklo_epi16(h_b16_even, h_c16_even);
+ // h_t16, h_t18, h_t20, ....
+ __m256i h_t16_even = _mm256_shuffle_epi8(h_t16, shuffle_mask);
+ __m256i t16_c16_even = _mm256_unpacklo_epi16(h_t16_even, h_c16_even);
+
+ __m256i h_cm1_c0 = _mm256_add_epi16(h_cm1, h_c0);
+ __m256i bc0_tc0_even = _mm256_add_epi16(b0_c0_even, t0_c0_even);  // b + t, 2 * c, b + t, 2 * c, ...
+ __m256i h_c15_c16 = _mm256_add_epi16(h_c15, h_c16);
+ __m256i bc16_tc16_even = _mm256_add_epi16(b16_c16_even, t16_c16_even);
+
+ __m256i h_cm1_c0_bc0_tc0_even =
+ _mm256_add_epi16(h_cm1_c0, bc0_tc0_even);
+ __m256i h_c15_c16_bc16_tc16_even =
+ _mm256_add_epi16(h_c15_c16, bc16_tc16_even);
+
+ __m256i res = _mm256_hadd_epi16(h_cm1_c0_bc0_tc0_even,  // adjacent pairs sum to top + [1 4 1] + bottom
+ h_c15_c16_bc16_tc16_even);
+
+ res = _mm256_permute4x64_epi64(res, 0xD8);  // undo the per-128-bit-lane interleave of hadd
+ _mm256_storeu_si256((__m256i *)(output_q3 + (w >> 1)), res);
+ }
+ } else {  // any other index: twice the sum of each 2x2 block
+ __m256i h_c0_b0 = _mm256_add_epi16(h_c0, h_b0);
+ __m256i h_c16_b16 = _mm256_add_epi16(h_c16, h_b16);
+
+ __m256i res = _mm256_hadd_epi16(h_c0_b0, h_c16_b16);
+ res = _mm256_permute4x64_epi64(res, 0xD8);  // undo the per-128-bit-lane interleave of hadd
+ res = _mm256_slli_epi16(res, 1);
+
+ _mm256_storeu_si256((__m256i *)(output_q3 + (w >> 1)), res);
+ }
+ }
+ output_q3 += output_stride;
+ input += (input_stride << 1);  // advance two input rows
+ }
+}
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 452291d..11ea695 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -883,4 +883,283 @@
INSTANTIATE_TEST_SUITE_P(VSX, CFLSubAvgTest,
::testing::ValuesIn(sub_avg_sizes_vsx));
#endif
+
+#if HAVE_AVX2
+std::array<std::array<int, 2>, 3> chroma_above_left_line_pairs = {
+ { { 0, 2 }, { 2, 0 }, { 2, 2 } }
+};
+
+std::array<std::array<int, 2>, 3> luma_above_left_line_pairs = {
+ { { 0, 4 }, { 4, 0 }, { 4, 4 } }
+};
+#endif // HAVE_AVX2
+
+const int chroma_tx_size[] = { 0, 1, 2, 3, 5, 6, 7, 8,
+ 9, 10, 13, 14, 15, 16, 19, 20 };
+
+typedef void (*av2_mhccp_implicit_fetch_neighbor_chroma_fn)(
+ const uint16_t *dst, int input_stride, TX_SIZE tx_size, int above_lines,
+ int left_lines, int is_top_sb_boundary, int ref_width, int ref_height,
+ uint16_t *output_q3);
+
+typedef std::tuple<std::array<int, 2>, int, int,
+ av2_mhccp_implicit_fetch_neighbor_chroma_fn>
+ mhccp_derive_param_chroma;
+
+class MhccpImplicitFetchNeighbourChromaTest
+ : public ::testing::TestWithParam<mhccp_derive_param_chroma> {
+ public:
+ void SetUp() override {
+ std::array<int, 2> arrlin = std::get<0>(GetParam());  // { above_lines, left_lines }
+ above_lines_ = arrlin[0];
+ left_lines_ = arrlin[1];
+ tx_size_ = std::get<1>(GetParam());
+ bd_ = std::get<2>(GetParam());
+
+ tgt_fn_ = std::get<3>(GetParam());
+ ref_fn_ = av2_mhccp_implicit_fetch_neighbor_chroma_c;
+
+ ref_width_ = AVMMIN((tx_size_wide[tx_size_] + left_lines_), 128);
+ ref_height_ = AVMMIN((tx_size_high[tx_size_] + above_lines_), 128);
+
+ initData();  // NOTE(review): draws from rnd_ before the Reset() below
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ av2_mhccp_implicit_fetch_neighbor_chroma_fn tgt_fn_;
+ av2_mhccp_implicit_fetch_neighbor_chroma_fn ref_fn_;
+ int above_lines_, left_lines_, ref_width_, ref_height_, bd_;
+ TX_SIZE tx_size_;
+ uint16_t *output_q3_ref_, *output_q3_tgt_, *input_;
+ ACMRandom rnd_;
+
+ void initData() {
+ int num_w, num_h;
+ num_w = num_h = (CFL_BUF_LINE << 1);  // square buffers, also used as the stride
+ const uint16_t mask = (1 << bd_) - 1;  // keeps only the low bd_ bits
+ input_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+ for (int i = 0; i < num_h; ++i) {
+ for (int j = 0; j < num_w; ++j) {
+ uint16_t val = this->rnd_.Rand16() & mask;
+ input_[i * num_w + j] = val;
+ }
+ }
+ output_q3_ref_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+ output_q3_tgt_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+ }
+
+ void TearDown() override {
+ free(input_);
+ free(output_q3_ref_);
+ free(output_q3_tgt_);
+ }
+
+ void assertMhccpChromaEqParams(int above_l, int left_l, int width, int height,
+ int ref_w, int ref_h, int stride) {
+ uint16_t *ref_ptr = output_q3_ref_;
+ uint16_t *tgt_ptr = output_q3_tgt_;
+ for (int i = 0; i < ref_h; ++i) {
+ for (int j = 0; j < ref_w; ++j) {
+ if ((i >= above_l && j >= left_l + width) ||
+ (i >= above_l + height && j >= left_l))
+ continue;  // same skip test as the functions under test; not compared
+ const uint16_t ref_val = ref_ptr[i * stride + j];
+ const uint16_t tgt_val = tgt_ptr[i * stride + j];
+ ASSERT_EQ(ref_val, tgt_val)
+ << "Mismatch at index[" << (i * stride + j) << "] ref=" << ref_val
+ << " tgt=" << tgt_val;
+ }
+ }
+ }
+};
+
+TEST_P(MhccpImplicitFetchNeighbourChromaTest, CompareCAndAVX2) {
+ const int is_top_sb_boundary = rnd_(2);
+ int dst_stride_ = CFL_BUF_LINE << 1;
+ ref_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+ is_top_sb_boundary, ref_width_, ref_height_, output_q3_ref_);
+
+ tgt_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+ is_top_sb_boundary, ref_width_, ref_height_, output_q3_tgt_);
+
+ assertMhccpChromaEqParams(above_lines_, left_lines_, tx_size_wide[tx_size_],
+ tx_size_high[tx_size_], ref_width_, ref_height_,
+ dst_stride_);
+}
+
+TEST_P(MhccpImplicitFetchNeighbourChromaTest, DISABLED_SpeedTest) {
+ const int is_top_sb_boundary = 0;
+ avm_usec_timer ref_timer, tgt_timer;
+ int dst_stride_ = CFL_BUF_LINE << 1;
+
+ avm_usec_timer_start(&ref_timer);
+ for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+ ref_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+ is_top_sb_boundary, ref_width_, ref_height_, output_q3_ref_);
+ }
+ avm_usec_timer_mark(&ref_timer);
+ const int ref_time = (int)avm_usec_timer_elapsed(&ref_timer);
+
+ avm_usec_timer_start(&tgt_timer);
+ for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+ tgt_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+ is_top_sb_boundary, ref_width_, ref_height_, output_q3_tgt_);
+ }
+ avm_usec_timer_mark(&tgt_timer);
+ const int tgt_time = (int)avm_usec_timer_elapsed(&tgt_timer);
+
+ printSpeed(ref_time, tgt_time, tx_size_wide[tx_size_],
+ tx_size_high[tx_size_]);
+ assertFaster(ref_time, tgt_time);
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, MhccpImplicitFetchNeighbourChromaTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(chroma_above_left_line_pairs),
+ ::testing::ValuesIn(chroma_tx_size), // tx_size
+ ::testing::Values(8, 10, 12), // bd
+ ::testing::Values(av2_mhccp_implicit_fetch_neighbor_chroma_avx2)));
+#endif // HAVE_AVX2
+
+typedef void (*av2_mhccp_implicit_fetch_neighbor_luma_420_fn)(
+ const uint16_t *input, int input_stride, int above_lines, int left_lines,
+ int is_top_sb_boundary, int ref_width, int ref_height, int sub_y,
+ uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3,
+ int output_stride);
+
+typedef std::tuple<std::array<int, 2>, int, int, int, int,
+ av2_mhccp_implicit_fetch_neighbor_luma_420_fn>
+ mhccp_derive_param_luma;
+
+class MhccpImplicitFetchNeighbourLumaTest
+ : public ::testing::TestWithParam<mhccp_derive_param_luma> {
+ public:
+ void SetUp() override {
+ std::array<int, 2> arrlin = std::get<0>(GetParam());  // { above_lines, left_lines }
+ above_lines_ = arrlin[0];
+ left_lines_ = arrlin[1];
+ cfl_ds_filter_index_ = std::get<1>(GetParam());
+ width_ = std::get<2>(GetParam()) << 1;  // the parameter is doubled here
+ height_ = std::get<3>(GetParam()) << 1;  // the parameter is doubled here
+ bd_ = std::get<4>(GetParam());
+
+ tgt_fn_ = std::get<5>(GetParam());
+ ref_fn_ = av2_mhccp_implicit_fetch_neighbor_luma_420_c;
+
+ initData();  // NOTE(review): draws from rnd_ before the Reset() below
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ av2_mhccp_implicit_fetch_neighbor_luma_420_fn tgt_fn_;
+ av2_mhccp_implicit_fetch_neighbor_luma_420_fn ref_fn_;
+ int above_lines_, left_lines_, ref_width_, ref_height_, bd_;
+ int sub_y_, input_stride_, output_stride_;
+ int width_, height_;
+ uint16_t *output_q3_ref_, *output_q3_tgt_, *input_;
+ uint8_t cfl_ds_filter_index_;
+ ACMRandom rnd_;
+
+ void initData() {
+ int num_w, num_h;
+ sub_y_ = 1;
+ input_stride_ = output_stride_ = (CFL_BUF_LINE << 1);
+ ref_width_ = AVMMIN(left_lines_ + width_, 128);
+ ref_height_ = AVMMIN(above_lines_ + height_, 128);
+ num_w = num_h = (CFL_BUF_LINE << 1);  // square buffers, same size as the strides
+ const uint16_t mask = (1 << bd_) - 1;  // keeps only the low bd_ bits
+ input_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+ for (int i = 0; i < num_h; ++i) {
+ for (int j = 0; j < num_w; ++j) {
+ uint16_t val = this->rnd_.Rand16() & mask;
+ input_[i * num_w + j] = val;
+ }
+ }
+ output_q3_ref_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+ output_q3_tgt_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+ }
+
+ void TearDown() override {
+ free(input_);
+ free(output_q3_ref_);
+ free(output_q3_tgt_);
+ }
+
+ void assertMhccpLumaEqParams(int above_l, int left_l, int width, int height,
+ int ref_w, int ref_h) {
+ uint16_t *ref_ptr = output_q3_ref_;
+ uint16_t *tgt_ptr = output_q3_tgt_;
+ for (int i = 0; i < ref_h; i += 2) {  // steps by 2, like the functions under test
+ for (int j = 0; j < ref_w; j += 2) {
+ if ((i >= above_l && j >= left_l + width) ||
+ (i >= above_l + height && j >= left_l))
+ continue;  // same skip test as the C reference; not compared
+ const uint16_t ref_val = ref_ptr[(i >> 1) * output_stride_ + (j >> 1)];
+ const uint16_t tgt_val = tgt_ptr[(i >> 1) * output_stride_ + (j >> 1)];
+ ASSERT_EQ(ref_val, tgt_val)
+ << "Mismatch at index[" << ((i >> 1) * output_stride_ + (j >> 1))
+ << "] ref=" << ref_val << " tgt=" << tgt_val;
+ }
+ }
+ }
+};
+
+TEST_P(MhccpImplicitFetchNeighbourLumaTest, CompareCAndAVX2) {
+ const int is_top_sb_boundary = rnd_(2);
+ ref_fn_(input_, input_stride_, above_lines_, left_lines_, is_top_sb_boundary,
+ ref_width_, ref_height_, sub_y_, cfl_ds_filter_index_, width_,
+ height_, output_q3_ref_, output_stride_);
+
+ tgt_fn_(input_, input_stride_, above_lines_, left_lines_, is_top_sb_boundary,
+ ref_width_, ref_height_, sub_y_, cfl_ds_filter_index_, width_,
+ height_, output_q3_tgt_, output_stride_);
+
+ assertMhccpLumaEqParams(above_lines_, left_lines_, width_, height_,
+ ref_width_, ref_height_);
+}
+
+TEST_P(MhccpImplicitFetchNeighbourLumaTest, DISABLED_SpeedTest) {
+ const int is_top_sb_boundary = 0;
+ avm_usec_timer ref_timer, tgt_timer;
+ avm_usec_timer_start(&ref_timer);
+ for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+ ref_fn_(input_, input_stride_, above_lines_, left_lines_,
+ is_top_sb_boundary, ref_width_, ref_height_, sub_y_,
+ cfl_ds_filter_index_, width_, height_, output_q3_ref_,
+ output_stride_);
+ }
+ avm_usec_timer_mark(&ref_timer);
+ const int ref_time = (int)avm_usec_timer_elapsed(&ref_timer);
+
+ avm_usec_timer_start(&tgt_timer);
+ for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+ tgt_fn_(input_, input_stride_, above_lines_, left_lines_,
+ is_top_sb_boundary, ref_width_, ref_height_, sub_y_,
+ cfl_ds_filter_index_, width_, height_, output_q3_tgt_,
+ output_stride_);
+ }
+ avm_usec_timer_mark(&tgt_timer);
+ const int tgt_time = (int)avm_usec_timer_elapsed(&tgt_timer);
+
+ printSpeed(ref_time, tgt_time, (width_ >> 1), (height_ >> 1));
+ assertFaster(ref_time, tgt_time);
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, MhccpImplicitFetchNeighbourLumaTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(luma_above_left_line_pairs),
+ ::testing::Values(0, 1, 2), // cfl_ds_filter_index_
+ ::testing::Values(4, 8, 16, 32, 64), // width_
+ ::testing::Values(4, 8, 16, 32, 64), // height_
+ ::testing::Values(8, 10, 12), // bd
+ ::testing::Values(av2_mhccp_implicit_fetch_neighbor_luma_420_avx2)));
+#endif // HAVE_AVX2
+
} // namespace