Temporal filter improvement The hierarchical TF block sizes are added for better temporal filter performance. For HD set, this gives coding gain: -0.4% SSIM, -0.07% PSNR, -0.21% VMAF/VMAF_neg. SIMD for 64x64 block TF function will be added. STATS_CHANGED Change-Id: Id7628b6105f19fff0ac16491ad36b9ea5009df01
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index 4437110..58dbf70 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c
@@ -48,8 +48,90 @@ // Forward Declaration. static void tf_determine_block_partition(const MV block_mv, const int block_mse, + const MV *midblock_mvs, + const int *midblock_mses, MV *subblock_mvs, int *subblock_mses); +// Do motion search for 1 subblock. The block size can be 32x32 or 16x16. +static void subblock_motion_search( + AV1_COMP *cpi, MACROBLOCK *mb, const YV12_BUFFER_CONFIG *frame_to_filter, + const YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE tf_block_size, int mb_row, + int mb_col, BLOCK_SIZE subblock_size, int idx, int subblock_ofst_i, + int subblock_ofst_j, FULLPEL_MOTION_SEARCH_PARAMS *full_ms_params, + SUBPEL_MOTION_SEARCH_PARAMS *ms_params, const MV *ref_mv, + FULLPEL_MV start_mv, const search_site_config *search_site_cfg, + SEARCH_METHODS search_method, MV_COST_TYPE mv_cost_type, int step_param, + SUBPEL_SEARCH_TYPE subpel_ms_type, MV *subblock_mvs, int *subblock_mses, + FULLPEL_MV_STATS *best_mv_stats, int q) { + MACROBLOCKD *const mbd = &mb->e_mbd; + + const int mb_height = block_size_high[tf_block_size]; + const int mb_width = block_size_wide[tf_block_size]; + const int mi_h = mi_size_high_log2[tf_block_size]; + const int mi_w = mi_size_wide_log2[tf_block_size]; + + const int y_stride = frame_to_filter->y_stride; + assert(y_stride == ref_frame->y_stride); + const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; + + const int subblock_height = block_size_high[subblock_size]; + const int subblock_width = block_size_wide[subblock_size]; + const int subblock_pels = subblock_height * subblock_width; + + int_mv best_mv; // Searched motion vector. + unsigned int sse, error; + int distortion; + int cost_list[5]; + + av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits, + (mb_row << mi_h) + (subblock_ofst_i >> MI_SIZE_LOG2), + (subblock_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits, + (mb_col << mi_w) + (subblock_ofst_j >> MI_SIZE_LOG2), + (subblock_width >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + const int boffset = subblock_ofst_i * y_stride + subblock_ofst_j; + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + boffset; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + boffset; + av1_make_default_fullpel_ms_params(full_ms_params, cpi, mb, subblock_size, + ref_mv, start_mv, search_site_cfg, + search_method, + /*fine_search_interval=*/0); + full_ms_params->run_mesh_search = 1; + full_ms_params->mv_cost_params.mv_cost_type = mv_cost_type; + + // May need to re-tune prune_mesh_search after increase TF + // block to 64x64. + if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { + // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. + full_ms_params->prune_mesh_search = (q <= 20) ? 0 : 1; + full_ms_params->mesh_search_mv_diff_threshold = 2; + } + + av1_full_pixel_search(start_mv, full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, + best_mv_stats, NULL); + + av1_make_default_subpel_ms_params(ms_params, cpi, mb, subblock_size, ref_mv, + cost_list); + ms_params->forced_stop = EIGHTH_PEL; + ms_params->var_params.subpel_search_type = subpel_ms_type; + // Since we are merely refining the result from full pixel + // search, we don't need regularization for subpel search + ms_params->mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats->err_cost = 0; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params->mv_limits, subpel_start_mv)); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, ms_params, subpel_start_mv, best_mv_stats, + &best_mv.as_mv, &distortion, &sse, NULL); + + subblock_mses[idx] = DIVIDE_AND_ROUND(error, subblock_pels); + subblock_mvs[idx] = best_mv.as_mv; +} + // This function returns the minimum and maximum log variances for 4x4 sub // blocks in the current block. static inline void get_log_var_4x4sub_blk( @@ -236,6 +318,10 @@ FULLPEL_MV_STATS best_mv_stats; int block_mse = INT_MAX; MV block_mv = kZeroMv; + // 32x32 block motion search results. + int midblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + MV midblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv }; + const int q = get_q(cpi); av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, @@ -289,54 +375,49 @@ *is_dc_diff_large = 50 * error < sse; if (src_var <= 2 * (int64_t)distortion) *is_low_cntras = 1; + // On 4 mid-blocks in the 64x64 tf block. if (allow_me_for_sub_blks) { - // On 4 sub-blocks. - const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1]; - const int subblock_height = block_size_high[subblock_size]; - const int subblock_width = block_size_wide[subblock_size]; - const int subblock_pels = subblock_height * subblock_width; - start_mv = get_fullmv_from_mv(ref_mv); + // midblock_size is 32x32, which corresponds to each 32x32 block in the + // 64x64 block. + const BLOCK_SIZE midblock_size = av1_ss_size_lookup[block_size][1][1]; + const int midblock_height = block_size_high[midblock_size]; + const int midblock_width = block_size_wide[midblock_size]; + int midblock_idx = 0; + for (int i = 0; i < mb_height; i += midblock_height) { + for (int j = 0; j < mb_width; j += midblock_width) { + start_mv = get_fullmv_from_mv(ref_mv); - int subblock_idx = 0; - for (int i = 0; i < mb_height; i += subblock_height) { - for (int j = 0; j < mb_width; j += subblock_width) { - const int offset = i * y_stride + j; - mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset; - mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset; - av1_make_default_fullpel_ms_params( - &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv, - search_site_cfg, search_method, - /*fine_search_interval=*/0); - full_ms_params.run_mesh_search = 1; - full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; + subblock_motion_search( + cpi, mb, frame_to_filter, ref_frame, block_size, mb_row, mb_col, + midblock_size, midblock_idx, i, j, &full_ms_params, &ms_params, + &baseline_mv, start_mv, search_site_cfg, search_method, + mv_cost_type, step_param, subpel_search_type, midblock_mvs, + midblock_mses, &best_mv_stats, q); - if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { - // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. - full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; - full_ms_params.mesh_search_mv_diff_threshold = 2; + // On 4 sub-blocks in 1 mid-block. + { + const BLOCK_SIZE subblock_size = + av1_ss_size_lookup[midblock_size][1][1]; + const int subblock_height = block_size_high[subblock_size]; + const int subblock_width = block_size_wide[subblock_size]; + + start_mv = get_fullmv_from_mv(&midblock_mvs[midblock_idx]); + + int bidx = midblock_idx * 4; + for (int bi = 0; bi < midblock_height; bi += subblock_height) { + for (int bj = 0; bj < midblock_width; bj += subblock_width) { + subblock_motion_search( + cpi, mb, frame_to_filter, ref_frame, block_size, mb_row, + mb_col, subblock_size, bidx, (i + bi), (j + bj), + &full_ms_params, &ms_params, &baseline_mv, start_mv, + search_site_cfg, search_method, mv_cost_type, step_param, + subpel_search_type, subblock_mvs, subblock_mses, + &best_mv_stats, q); + ++bidx; + } + } } - av1_full_pixel_search(start_mv, &full_ms_params, step_param, - cond_cost_list(cpi, cost_list), - &best_mv.as_fullmv, &best_mv_stats, NULL); - - av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size, - &baseline_mv, cost_list); - ms_params.forced_stop = EIGHTH_PEL; - ms_params.var_params.subpel_search_type = subpel_search_type; - // Since we are merely refining the result from full pixel search, we - // don't need regularization for subpel search - ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; - best_mv_stats.err_cost = 0; - - subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); - assert( - av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); - error = cpi->mv_search_params.find_fractional_mv_step( - &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, - &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL); - subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels); - subblock_mvs[subblock_idx] = best_mv.as_mv; - ++subblock_idx; + ++midblock_idx; } } } @@ -348,11 +429,11 @@ // Make partition decision. if (allow_me_for_sub_blks) { - tf_determine_block_partition(block_mv, block_mse, subblock_mvs, - subblock_mses); + tf_determine_block_partition(block_mv, block_mse, midblock_mvs, + midblock_mses, subblock_mvs, subblock_mses); } else { - // Copy 32X32 block mv and mse values to sub blocks - for (int i = 0; i < 4; ++i) { + // Copy 64X64 block mv and mse values to sub blocks + for (int i = 0; i < 16; ++i) { subblock_mvs[i] = block_mv; subblock_mses[i] = block_mse; } @@ -372,6 +453,8 @@ // block_mv: Motion vector for the entire block (ONLY as reference). // block_mse: Motion search error (MSE) for the entire block (ONLY as // reference). +// midblock_mvs: Pointer to the motion vectors for 4 mid-blocks. +// midblock_mses: Pointer to the search errors (MSE) for 4 mid-blocks. // subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be // modified based on the partition decision). // subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will @@ -380,23 +463,53 @@ // Nothing will be returned. Results are saved in `subblock_mvs` and // `subblock_mses`. static void tf_determine_block_partition(const MV block_mv, const int block_mse, + const MV *midblock_mvs, + const int *midblock_mses, MV *subblock_mvs, int *subblock_mses) { int min_subblock_mse = INT_MAX; int max_subblock_mse = INT_MIN; int64_t sum_subblock_mse = 0; - for (int i = 0; i < 4; ++i) { + int i; + + // Go through 4 32x32 blocks. + for (int idx = 0; idx < 4; ++idx) { + min_subblock_mse = INT_MAX; + max_subblock_mse = INT_MIN; + sum_subblock_mse = 0; + + const int sub_idx = idx * 4; + for (i = sub_idx; i < sub_idx + 4; ++i) { + sum_subblock_mse += subblock_mses[i]; + min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]); + max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]); + } + + if (((midblock_mses[idx] * 15 <= sum_subblock_mse * 4) && + max_subblock_mse - min_subblock_mse < 48) || + ((midblock_mses[idx] * 14 <= sum_subblock_mse * 4) && + max_subblock_mse - min_subblock_mse < 24)) { // No split. + for (i = sub_idx; i < sub_idx + 4; ++i) { + subblock_mvs[i] = midblock_mvs[idx]; + subblock_mses[i] = midblock_mses[idx]; + } + } + } + + min_subblock_mse = INT_MAX; + max_subblock_mse = INT_MIN; + sum_subblock_mse = 0; + for (i = 0; i < 16; ++i) { sum_subblock_mse += subblock_mses[i]; min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]); max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]); } - // TODO(any): The following magic numbers may be tuned to improve the - // performance OR find a way to get rid of these magic numbers. - if (((block_mse * 15 < sum_subblock_mse * 4) && - max_subblock_mse - min_subblock_mse < 48) || - ((block_mse * 14 < sum_subblock_mse * 4) && - max_subblock_mse - min_subblock_mse < 24)) { // No split. - for (int i = 0; i < 4; ++i) { + if (((block_mse * 15 <= sum_subblock_mse) && + (max_subblock_mse - min_subblock_mse) * 16 < sum_subblock_mse * 3) || + ((block_mse * 14 <= sum_subblock_mse) && + (max_subblock_mse - min_subblock_mse) * 8 < + sum_subblock_mse)) { // No split. + for (i = 0; i < 16; ++i) { subblock_mvs[i] = block_mv; subblock_mses[i] = block_mse; } @@ -466,35 +579,45 @@ const int plane_w = mb_width >> subsampling_x; // Plane width. const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left). const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left). - const int h = plane_h >> 1; // Sub-block height. - const int w = plane_w >> 1; // Sub-block width. - const int is_y_plane = (plane == 0); // Is Y-plane? + + const int h32 = plane_h >> 1; // 32x32 sub-block height. + const int w32 = plane_w >> 1; // 32x32 sub-block width. + const int h16 = plane_h >> 2; // 16x16 sub-block height. + const int w16 = plane_w >> 2; // 16x16 sub-block width. + + const int is_y_plane = (plane == 0); // Is Y-plane? const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane], ref_frame->widths[is_y_plane ? 0 : 1], ref_frame->heights[is_y_plane ? 0 : 1], ref_frame->strides[is_y_plane ? 0 : 1] }; - // Handle each subblock. - int subblock_idx = 0; - for (int i = 0; i < plane_h; i += h) { - for (int j = 0; j < plane_w; j += w) { - // Choose proper motion vector. - const MV mv = subblock_mvs[subblock_idx++]; - assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX && - mv.col >= INT16_MIN && mv.col <= INT16_MAX); + const int sub_y[4] = { 0, 0, h32, h32 }; + const int sub_x[4] = { 0, w32, 0, w32 }; + // Handle each 16x16 subblock. + for (int idx = 0; idx < 4; ++idx) { + int subblock_idx = idx * 4; + for (int i = 0; i < h32; i += h16) { + for (int j = 0; j < w32; j += w16) { + // Choose proper motion vector. + const MV mv = subblock_mvs[subblock_idx++]; + assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX && + mv.col >= INT16_MIN && mv.col <= INT16_MAX); - const int y = plane_y + i; - const int x = plane_x + j; + const int y = plane_y + sub_y[idx] + i; + const int x = plane_x + sub_x[idx] + j; - // Build predictior for each sub-block on current plane. - InterPredParams inter_pred_params; - av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, - subsampling_y, bit_depth, is_high_bitdepth, - is_intrabc, scale, &ref_buf, interp_filters); - inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); - av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j], - plane_w, &mv, &inter_pred_params); + // Build predictior for each sub-block on current plane. + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, w16, h16, y, x, + subsampling_x, subsampling_y, bit_depth, + is_high_bitdepth, is_intrabc, scale, &ref_buf, + interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + av1_enc_build_one_inter_predictor( + &pred[plane_offset + (sub_y[idx] + i) * plane_w + sub_x[idx] + j], + plane_w, &mv, &inter_pred_params); + } } } plane_offset += plane_h * plane_w; @@ -711,16 +834,6 @@ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); decay_factor[plane] = 1 / (n_decay * q_decay * s_decay); } - double d_factor[4] = { 0 }; - for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { - // Larger motion vector -> smaller filtering weight. - const MV mv = subblock_mvs[subblock_idx]; - const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); - double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; - distance_threshold = AOMMAX(distance_threshold, 1); - d_factor[subblock_idx] = distance / distance_threshold; - d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); - } // Allocate memory for pixel-wise squared differences. They, // regardless of the subsampling, are assigned with memory of size `mb_pels`. @@ -796,14 +909,26 @@ // Combine window error and block error, and normalize it. const double window_error = sum_square_diff * inv_num_ref_pixels; - const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2); + + // 16x16 block index + const int y32 = i / (h / 2); + const int x32 = j / (w / 2); + const int y16 = (i % (h / 2)) / (h / 4); + const int x16 = (j % (w / 2)) / (w / 4); + const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + const double distance_threshold = + (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1); + const double d_factor = AOMMAX(distance / distance_threshold, 1); + // Compute filter weight. - double scaled_error = - combined_error * d_factor[subblock_idx] * decay_factor[plane]; + double scaled_error = combined_error * d_factor * decay_factor[plane]; scaled_error = AOMMIN(scaled_error, 7); int weight; if (tf_wgt_calc_lvl == 0) { @@ -930,6 +1055,17 @@ const GF_GROUP *gf_group = &cpi->ppi->gf_group; const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + // Determine whether the video is with `YUV 4:2:2` format, since the avx2/sse2 + // function only supports square block size. We will use C function instead + // for videos with `YUV 4:2:2` format. + int is_yuv422_format = 0; + for (int plane = 1; plane < num_planes; ++plane) { + if (mbd->plane[plane].subsampling_x != mbd->plane[plane].subsampling_y) { + is_yuv422_format = 1; + break; + } + } + // Do filtering. FRAME_DIFF *diff = &td->tf_data.diff; av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits, @@ -967,8 +1103,16 @@ if (frames[frame] == NULL) continue; // Motion search. - MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv }; - int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + // block size is 64x64. 16 16x16 in 1 64x64. + // Store motion search results in 16x16 units. + MV subblock_mvs[16] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv, + kZeroMv, kZeroMv, kZeroMv, kZeroMv, + kZeroMv, kZeroMv, kZeroMv, kZeroMv, + kZeroMv, kZeroMv, kZeroMv, kZeroMv }; + int subblock_mses[16] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX }; int is_dc_diff_large = 0; int is_low_cntras = 0; @@ -1006,7 +1150,8 @@ // only supports 32x32 block size and 5x5 filtering window. if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth #if CONFIG_AV1_HIGHBITDEPTH - if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { + if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 && + TF_WINDOW_LENGTH == 5) { av1_highbd_apply_temporal_filter( frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor, @@ -1022,7 +1167,8 @@ #endif // CONFIG_AV1_HIGHBITDEPTH } else { // for 8-bit - if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { + if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 && + TF_WINDOW_LENGTH == 5) { av1_apply_temporal_filter( frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor,
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h index 9585942..3f93ab0 100644 --- a/av1/encoder/temporal_filter.h +++ b/av1/encoder/temporal_filter.h
@@ -30,7 +30,7 @@ #define BW 32 // Block size used in temporal filtering. -#define TF_BLOCK_SIZE BLOCK_32X32 +#define TF_BLOCK_SIZE BLOCK_64X64 // Window size for temporal filtering. #define TF_WINDOW_LENGTH 5
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc index 2e0ae0f..8d9afaa 100644 --- a/test/temporal_filter_test.cc +++ b/test/temporal_filter_test.cc
@@ -131,9 +131,9 @@ ColorFormat color_fmt) { aom_usec_timer ref_timer, test_timer; const BLOCK_SIZE block_size = TF_BLOCK_SIZE; - static_assert(block_size == BLOCK_32X32, ""); - const int width = 32; - const int height = 32; + static_assert(block_size == BLOCK_64X64, ""); + const int width = 64; + const int height = 64; int num_planes = MAX_MB_PLANE; int subsampling_x = 0; int subsampling_y = 0; @@ -174,7 +174,7 @@ memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0])); memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0])); - static_assert(width == 32 && height == 32, ""); + static_assert(width == 64 && height == 64, ""); const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } }; const int subblock_mses[4] = { 15, 16, 17, 18 }; const int q_factor = 12; @@ -284,39 +284,46 @@ RunTest(1, 100000, I444); } -#if HAVE_AVX2 -TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam( - &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) }; -INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest, - Combine(ValuesIn(temporal_filter_test_avx2), - Values(0, 1))); -#endif // HAVE_AVX2 +// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function +// needs to be updated. +// #if HAVE_AVX2 +// TemporalFilterFuncParam temporal_filter_test_avx2[] = { +// TemporalFilterFuncParam( +// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) }; +// INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest, +// Combine(ValuesIn(temporal_filter_test_avx2), +// Values(0, 1))); +// #endif // HAVE_AVX2 +// +// #if HAVE_SSE2 +// TemporalFilterFuncParam temporal_filter_test_sse2[] = { +// TemporalFilterFuncParam( +// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) }; +// INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest, +// Combine(ValuesIn(temporal_filter_test_sse2), +// Values(0, 1))); +// #endif // HAVE_SSE2 -#if HAVE_SSE2 -TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam( - &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) }; -INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest, - Combine(ValuesIn(temporal_filter_test_sse2), - Values(0, 1))); -#endif // HAVE_SSE2 +// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function +// needs to be updated. +// #if HAVE_NEON +// TemporalFilterFuncParam temporal_filter_test_neon[] = { +// TemporalFilterFuncParam( +// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) }; +// INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest, +// Combine(ValuesIn(temporal_filter_test_neon), +// Values(0, 1))); +// #endif // HAVE_NEON -#if HAVE_NEON -TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam( - &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) }; -INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest, - Combine(ValuesIn(temporal_filter_test_neon), - Values(0, 1))); -#endif // HAVE_NEON - -#if HAVE_NEON_DOTPROD -TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = { - TemporalFilterFuncParam(&av1_apply_temporal_filter_c, - &av1_apply_temporal_filter_neon_dotprod) -}; -INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest, - Combine(ValuesIn(temporal_filter_test_neon_dotprod), - Values(0, 1))); -#endif // HAVE_NEON_DOTPROD +// #if HAVE_NEON_DOTPROD +// TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = { +// TemporalFilterFuncParam(&av1_apply_temporal_filter_c, +// &av1_apply_temporal_filter_neon_dotprod) +// }; +// INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest, +// Combine(ValuesIn(temporal_filter_test_neon_dotprod), +// Values(0, 1))); +// #endif // HAVE_NEON_DOTPROD #if HAVE_AVX2 || HAVE_NEON // Width and height for which av1_estimate_noise_from_single_plane() will be @@ -514,9 +521,9 @@ ColorFormat color_fmt) { aom_usec_timer ref_timer, test_timer; const BLOCK_SIZE block_size = TF_BLOCK_SIZE; - static_assert(block_size == BLOCK_32X32, ""); - const int width = 32; - const int height = 32; + static_assert(block_size == BLOCK_64X64, ""); + const int width = 64; + const int height = 64; int num_planes = MAX_MB_PLANE; int subsampling_x = 0; int subsampling_y = 0; @@ -557,7 +564,7 @@ memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0])); memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0])); - static_assert(width == 32 && height == 32, ""); + static_assert(width == 64 && height == 64, ""); const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } }; const int subblock_mses[4] = { 15, 16, 17, 18 }; const int q_factor = 12; @@ -667,34 +674,39 @@ RunTest(1, 100000, 10, I422); RunTest(1, 100000, 10, I444); } -#if HAVE_SSE2 -HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = { - HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, - &av1_highbd_apply_temporal_filter_sse2) -}; -INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest, - Combine(ValuesIn(HBDtemporal_filter_test_sse2), - Values(0, 1))); -#endif // HAVE_SSE2 -#if HAVE_AVX2 -HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = { - HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, - &av1_highbd_apply_temporal_filter_avx2) -}; -INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest, - Combine(ValuesIn(HBDtemporal_filter_test_avx2), - Values(0, 1))); -#endif // HAVE_AVX2 -#if HAVE_NEON -HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = { - HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, - &av1_highbd_apply_temporal_filter_neon) -}; -INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest, - Combine(ValuesIn(HBDtemporal_filter_test_neon), - Values(0, 1))); -#endif // HAVE_NEON +// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function +// needs to be updated. +// #if HAVE_SSE2 +// HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = { +// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, +// &av1_highbd_apply_temporal_filter_sse2) +//}; +// INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest, +// Combine(ValuesIn(HBDtemporal_filter_test_sse2), +// Values(0, 1))); +// #endif // HAVE_SSE2 +// #if HAVE_AVX2 +// HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = { +// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, +// &av1_highbd_apply_temporal_filter_avx2) +//}; +// INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest, +// Combine(ValuesIn(HBDtemporal_filter_test_avx2), +// Values(0, 1))); +// #endif // HAVE_AVX2 + +// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function +// needs to be updated. +// #if HAVE_NEON +// HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = { +// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, +// &av1_highbd_apply_temporal_filter_neon) +//}; +// INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest, +// Combine(ValuesIn(HBDtemporal_filter_test_neon), +// Values(0, 1))); +// #endif // HAVE_NEON using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height, int width, int stride, int bit_depth,