av1_fwd_txfm2d_neon.c: Use switch for small square problem sizes
For the smallest problem sizes we have a significant overhead from
needing to load and store the transform intermediate vectors. Avoiding
the kernel lookup and calling the kernels directly significantly
improves performance in these cases.
Benchmarking on a Neoverse N2 machine with Clang 16 and GCC 12, the
speed tests report a geomean reduction in run time of ~22% for 4x4
and ~4.6% for 8x8.
Change-Id: I5907fab09e40b1cea57c446fd7e604ae911ceae8
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index a17a41a..d70f5a5 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -1598,44 +1598,6 @@
int32_t *output, int stride,
int cos_bit);
-static const col_transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = {
- fdct4x4_col_neon, // DCT_DCT
- fadst4x4_col_neon, // ADST_DCT
- fdct4x4_col_neon, // DCT_ADST
- fadst4x4_col_neon, // ADST_ADST
- fadst4x4_col_neon, // FLIPADST_DCT
- fdct4x4_col_neon, // DCT_FLIPADST
- fadst4x4_col_neon, // FLIPADST_FLIPADST
- fadst4x4_col_neon, // ADST_FLIPADST
- fadst4x4_col_neon, // FLIPADST_ADST
- fidentity4x4_col_neon, // IDTX
- fdct4x4_col_neon, // V_DCT
- fidentity4x4_col_neon, // H_DCT
- fadst4x4_col_neon, // V_ADST
- fidentity4x4_col_neon, // H_ADST
- fadst4x4_col_neon, // V_FLIPADST
- fidentity4x4_col_neon // H_FLIPADST
-};
-
-static const row_transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = {
- fdct4x4_row_neon, // DCT_DCT
- fdct4x4_row_neon, // ADST_DCT
- fadst4x4_row_neon, // DCT_ADST
- fadst4x4_row_neon, // ADST_ADST
- fdct4x4_row_neon, // FLIPADST_DCT
- fadst4x4_row_neon, // DCT_FLIPADST
- fadst4x4_row_neon, // FLIPADST_FLIPADST
- fadst4x4_row_neon, // ADST_FLIPADST
- fadst4x4_row_neon, // FLIPADST_ADST
- fidentity4x4_row_neon, // IDTX
- fidentity4x4_row_neon, // V_DCT
- fdct4x4_row_neon, // H_DCT
- fidentity4x4_row_neon, // V_ADST
- fadst4x4_row_neon, // H_ADST
- fidentity4x4_row_neon, // V_FLIPADST
- fadst4x4_row_neon // H_FLIPADST
-};
-
static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
fdct4x8_col_neon, // DCT_DCT
fadst4x8_col_neon, // ADST_DCT
@@ -1943,21 +1905,96 @@
static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x4_t buf0[4], buf1[4];
- const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type];
- const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type];
int ud_flip, lr_flip;
-
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
- col_txfm(input, buf0, stride, 13);
- transpose_arrays_s16_4x4(buf0, buf1);
- if (lr_flip) {
- flip_buf_4_neon(buf1, buf0, 4);
- row_txfm(buf0, output, 4, 13);
- } else {
- row_txfm(buf1, output, 4, 13);
+ int16x4_t buf0[4], buf1[4];
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case ADST_DCT:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case DCT_ADST:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case ADST_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case FLIPADST_DCT:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case DCT_FLIPADST:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case FLIPADST_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case ADST_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case FLIPADST_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case IDTX:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_DCT:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_DCT:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_ADST:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_FLIPADST:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
}
}
@@ -2040,22 +2077,113 @@
static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- int16x8_t buf0[8], buf1[8];
- const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
- const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
int ud_flip, lr_flip;
-
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
- col_txfm(input, buf0, stride, 13);
- shift_right_1_round_s16_x8(buf0, buf0, 8);
- transpose_arrays_s16_8x8(buf0, buf1);
- if (lr_flip) {
- flip_buf_8_neon(buf1, buf0, 8);
- row_txfm(buf0, output, 8, 13);
- } else {
- row_txfm(buf1, output, 8, 13);
+ int16x8_t buf0[8], buf1[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case ADST_DCT:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case DCT_ADST:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case ADST_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case FLIPADST_DCT:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case DCT_FLIPADST:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case FLIPADST_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case ADST_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case FLIPADST_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case IDTX:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_DCT:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_DCT:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_ADST:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_FLIPADST:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
}
}