Remove duplicate Neon transpose_concat* helpers
Move Neon transpose_concat* helper functions to transpose_neon.h and
delete duplicate definitions.
Change-Id: Iacb083e98a5c132767aa4bb5cecd7239ff940257
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
index d714fdd..6013a33 100644
--- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -289,57 +289,6 @@
}
}
-static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, XX, XX, XX, XX
- // a1: 10, 11, 12, 13, XX, XX, XX, XX
- // a2: 20, 21, 22, 23, XX, XX, XX, XX
- // a3: 30, 31, 32, 33, XX, XX, XX, XX
- //
- // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-
- int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
- int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
- int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
- int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
-
- int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
- int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
-
- int16x8_t a0123 =
- vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
-
- *b = vreinterpretq_s8_s16(a0123);
-}
-
-static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, 04, 05, 06, 07
- // a1: 10, 11, 12, 13, 14, 15, 16, 17
- // a2: 20, 21, 22, 23, 24, 25, 26, 27
- // a3: 30, 31, 32, 33, 34, 35, 36, 37
- //
- // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-
- int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
- int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
- int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
- int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
-
- int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
- int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
-
- int16x8x2_t a0123 =
- vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
-
- *b0 = vreinterpretq_s8_s16(a0123.val[0]);
- *b1 = vreinterpretq_s8_s16(a0123.val[1]);
-}
-
static inline int16x4_t convolve8_4_v(const int8x16_t samples_lo,
const int8x16_t samples_hi,
const int8x8_t filters) {
@@ -403,10 +352,10 @@
// This operation combines a conventional transpose and the sample permute
// (see horizontal case) required before computing the dot product.
int8x16_t s0123, s1234, s2345, s3456;
- transpose_concat_4x4(s0, s1, s2, s3, &s0123);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+ transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
@@ -418,7 +367,7 @@
int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
int8x16_t s4567, s5678, s6789, s78910;
- transpose_concat_4x4(s7, s8, s9, s10, &s78910);
+ transpose_concat_elems_s8_4x4(s7, s8, s9, s10, &s78910);
// Merge new data into block from previous iteration.
samples_LUT.val[0] = s3456;
@@ -472,10 +421,10 @@
// (see horizontal case) required before computing the dot product.
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
s3456_lo, s3456_hi;
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+ transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
@@ -488,7 +437,7 @@
int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
s78910_lo, s78910_hi;
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+ transpose_concat_elems_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
// Merge new data into block from previous iteration.
samples_LUT.val[0] = s3456_lo;
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
index 6d1ab96..df85319 100644
--- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -269,58 +269,6 @@
}
}
-static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, XX, XX, XX, XX
- // a1: 10, 11, 12, 13, XX, XX, XX, XX
- // a2: 20, 21, 22, 23, XX, XX, XX, XX
- // a3: 30, 31, 32, 33, XX, XX, XX, XX
- //
- // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-
- uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
- uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
- uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
- uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
-
- uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
- uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
-
- uint16x8_t a0123 =
- vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
-
- *b = vreinterpretq_u8_u16(a0123);
-}
-
-static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, 04, 05, 06, 07
- // a1: 10, 11, 12, 13, 14, 15, 16, 17
- // a2: 20, 21, 22, 23, 24, 25, 26, 27
- // a3: 30, 31, 32, 33, 34, 35, 36, 37
- //
- // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-
- uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
- uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
- uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
- uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
-
- uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
- uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
-
- uint16x8x2_t a0123 =
- vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
-
- *b0 = vreinterpretq_u8_u16(a0123.val[0]);
- *b1 = vreinterpretq_u8_u16(a0123.val[1]);
-}
-
static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
const uint8x16_t samples_hi,
const int8x8_t filters) {
@@ -368,17 +316,17 @@
// This operation combines a conventional transpose and the sample permute
// (see horizontal case) required before computing the dot product.
uint8x16_t s0123, s1234, s2345, s3456;
- transpose_concat_4x4(s0, s1, s2, s3, &s0123);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+ transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
uint8x16_t s4567, s5678, s6789, s78910;
- transpose_concat_4x4(s7, s8, s9, s10, &s78910);
+ transpose_concat_elems_u8_4x4(s7, s8, s9, s10, &s78910);
// Merge new data into block from previous iteration.
samples_LUT.val[0] = s3456;
@@ -423,10 +371,10 @@
// (see horizontal case) required before computing the dot product.
uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
s3456_lo, s3456_hi;
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+ transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t s7, s8, s9, s10;
@@ -434,7 +382,7 @@
uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
s78910_lo, s78910_hi;
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+ transpose_concat_elems_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
// Merge new data into block from previous iteration.
samples_LUT.val[0] = s3456_lo;
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index b5db14b..882e360 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -20,6 +20,7 @@
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/highbd_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
static inline uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
uint16x4_t max) {
@@ -276,60 +277,6 @@
6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29
};
-static inline void transpose_concat_4x4(int16x4_t s0, int16x4_t s1,
- int16x4_t s2, int16x4_t s3,
- int16x8_t res[2]) {
- // Transpose 16-bit elements and concatenate result rows as follows:
- // s0: 00, 01, 02, 03
- // s1: 10, 11, 12, 13
- // s2: 20, 21, 22, 23
- // s3: 30, 31, 32, 33
- //
- // res[0]: 00 10 20 30 01 11 21 31
- // res[1]: 02 12 22 32 03 13 23 33
-
- int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
- int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
- int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
- int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
-
- int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
- int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
-
- int32x4x2_t s0123 = vzipq_s32(s01, s23);
-
- res[0] = vreinterpretq_s16_s32(s0123.val[0]);
- res[1] = vreinterpretq_s16_s32(s0123.val[1]);
-}
-
-static inline void transpose_concat_8x4(int16x8_t s0, int16x8_t s1,
- int16x8_t s2, int16x8_t s3,
- int16x8_t res[4]) {
- // Transpose 16-bit elements and concatenate result rows as follows:
- // s0: 00, 01, 02, 03, 04, 05, 06, 07
- // s1: 10, 11, 12, 13, 14, 15, 16, 17
- // s2: 20, 21, 22, 23, 24, 25, 26, 27
- // s3: 30, 31, 32, 33, 34, 35, 36, 37
- //
- // res_lo[0]: 00 10 20 30 01 11 21 31
- // res_lo[1]: 02 12 22 32 03 13 23 33
- // res_hi[0]: 04 14 24 34 05 15 25 35
- // res_hi[1]: 06 16 26 36 07 17 27 37
-
- int16x8x2_t tr01_16 = vzipq_s16(s0, s1);
- int16x8x2_t tr23_16 = vzipq_s16(s2, s3);
-
- int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]),
- vreinterpretq_s32_s16(tr23_16.val[0]));
- int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]),
- vreinterpretq_s32_s16(tr23_16.val[1]));
-
- res[0] = vreinterpretq_s16_s32(tr01_32.val[0]);
- res[1] = vreinterpretq_s16_s32(tr01_32.val[1]);
- res[2] = vreinterpretq_s16_s32(tr23_32.val[0]);
- res[3] = vreinterpretq_s16_s32(tr23_32.val[1]);
-}
-
static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
uint8x16_t tbl, int16x8_t res[4]) {
int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
@@ -426,10 +373,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
do {
int16x4_t s7, s8, s9, s10;
@@ -438,7 +385,7 @@
int16x8_t s4567[2], s5678[2], s6789[2], s78910[2];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_4x4(s7, s8, s9, s10, s78910);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s78910);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567);
@@ -481,10 +428,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
do {
int16x8_t s7, s8, s9, s10;
@@ -493,7 +440,7 @@
int16x8_t s4567[4], s5678[4], s6789[4], s78910[4];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_8x4(s7, s8, s9, s10, s78910);
+ transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s78910);
// Merge new data into block from previous iteration.
aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567);
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index 88df0d6..aa9b7f7 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -17,6 +17,165 @@
#include "aom_dsp/aom_dsp_common.h" // For AOM_FORCE_INLINE.
#include "config/aom_config.h"
+static inline void transpose_concat_elems_u8_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ // (XX: lanes 4-7 of each input never reach the output.)
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+ // Interleave bytes of rows 0/1 and 2/3: a01 = 00, 10, 01, 11, ...
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+ // Interleave 16-bit pairs; the low half holds columns 0-3 of all four rows.
+ uint16x8_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
+
+ *b = vreinterpretq_u8_u16(a0123);
+}
+
+static inline void transpose_concat_elems_u8_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0,
+ uint8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+ // Interleave bytes of rows 0/1 and 2/3: a01 = 00, 10, 01, 11, ..., 07, 17
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+ // Interleave 16-bit pairs: val[0] holds columns 0-3, val[1] columns 4-7.
+ uint16x8x2_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
+
+ *b0 = vreinterpretq_u8_u16(a0123.val[0]);
+ *b1 = vreinterpretq_u8_u16(a0123.val[1]);
+}
+
+static inline void transpose_concat_elems_s8_4x4(int8x8_t a0, int8x8_t a1,
+ int8x8_t a2, int8x8_t a3,
+ int8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ // (XX: lanes 4-7 of each input never reach the output.)
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+ // Interleave bytes of rows 0/1 and 2/3: a01 = 00, 10, 01, 11, ...
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+ // Interleave 16-bit pairs; the low half holds columns 0-3 of all four rows.
+ int16x8_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
+
+ *b = vreinterpretq_s8_s16(a0123);
+}
+
+static inline void transpose_concat_elems_s8_8x4(int8x8_t a0, int8x8_t a1,
+ int8x8_t a2, int8x8_t a3,
+ int8x16_t *b0, int8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+ // Interleave bytes of rows 0/1 and 2/3: a01 = 00, 10, 01, 11, ..., 07, 17
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+ // Interleave 16-bit pairs: val[0] holds columns 0-3, val[1] columns 4-7.
+ int16x8x2_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
+
+ *b0 = vreinterpretq_s8_s16(a0123.val[0]);
+ *b1 = vreinterpretq_s8_s16(a0123.val[1]);
+}
+
+static inline void transpose_concat_elems_s16_4x4(int16x4_t s0, int16x4_t s1,
+ int16x4_t s2, int16x4_t s3,
+ int16x8_t res[2]) {
+ // Transpose 16-bit elements and concatenate result rows as follows:
+ // s0: 00, 01, 02, 03
+ // s1: 10, 11, 12, 13
+ // s2: 20, 21, 22, 23
+ // s3: 30, 31, 32, 33
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+
+ int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
+ int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
+ int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
+ int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
+ // Interleave 16-bit elements of rows 0/1 and 2/3: s01 = 00, 10, 01, 11, ...
+ int32x4_t s01 = vreinterpretq_s32_s16(vzipq_s16(s0q, s1q).val[0]);
+ int32x4_t s23 = vreinterpretq_s32_s16(vzipq_s16(s2q, s3q).val[0]);
+ // Interleave 32-bit pairs: val[0] holds columns 0-1, val[1] columns 2-3.
+ int32x4x2_t s0123 = vzipq_s32(s01, s23);
+
+ res[0] = vreinterpretq_s16_s32(s0123.val[0]);
+ res[1] = vreinterpretq_s16_s32(s0123.val[1]);
+}
+
+static inline void transpose_concat_elems_s16_8x4(int16x8_t s0, int16x8_t s1,
+ int16x8_t s2, int16x8_t s3,
+ int16x8_t res[4]) {
+ // Transpose 16-bit elements and concatenate result rows as follows:
+ // s0: 00, 01, 02, 03, 04, 05, 06, 07
+ // s1: 10, 11, 12, 13, 14, 15, 16, 17
+ // s2: 20, 21, 22, 23, 24, 25, 26, 27
+ // s3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+ // res[2]: 04 14 24 34 05 15 25 35
+ // res[3]: 06 16 26 36 07 17 27 37
+
+ int16x8x2_t tr01_16 = vzipq_s16(s0, s1);
+ int16x8x2_t tr23_16 = vzipq_s16(s2, s3);
+
+ int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]),
+ vreinterpretq_s32_s16(tr23_16.val[0]));
+ int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]),
+ vreinterpretq_s32_s16(tr23_16.val[1]));
+
+ res[0] = vreinterpretq_s16_s32(tr01_32.val[0]);
+ res[1] = vreinterpretq_s16_s32(tr01_32.val[1]);
+ res[2] = vreinterpretq_s16_s32(tr23_32.val[0]);
+ res[3] = vreinterpretq_s16_s32(tr23_32.val[1]);
+}
+
static inline void transpose_elems_u8_8x8(
uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index 8d0d929..ab4b878 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -16,6 +16,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/convolve.h"
@@ -387,57 +388,6 @@
} while (h != 0);
}
-static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, XX, XX, XX, XX
- // a1: 10, 11, 12, 13, XX, XX, XX, XX
- // a2: 20, 21, 22, 23, XX, XX, XX, XX
- // a3: 30, 31, 32, 33, XX, XX, XX, XX
- //
- // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-
- int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
- int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
- int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
- int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
-
- int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
- int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
-
- int16x8_t a0123 =
- vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
-
- *b = vreinterpretq_s8_s16(a0123);
-}
-
-static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, 04, 05, 06, 07
- // a1: 10, 11, 12, 13, 14, 15, 16, 17
- // a2: 20, 21, 22, 23, 24, 25, 26, 27
- // a3: 30, 31, 32, 33, 34, 35, 36, 37
- //
- // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-
- int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
- int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
- int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
- int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
-
- int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
- int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
-
- int16x8x2_t a0123 =
- vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
-
- *b0 = vreinterpretq_s8_s16(a0123.val[0]);
- *b1 = vreinterpretq_s8_s16(a0123.val[1]);
-}
-
static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
const int8x16_t s2,
const int8x8_t filters_0_7,
@@ -505,14 +455,14 @@
int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
- transpose_concat_4x4(s0, s1, s2, s3, &s0123);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456);
- transpose_concat_4x4(s4, s5, s6, s7, &s4567);
- transpose_concat_4x4(s5, s6, s7, s8, &s5678);
- transpose_concat_4x4(s6, s7, s8, s9, &s6789);
- transpose_concat_4x4(s7, s8, s9, sA, &s789A);
+ transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);
+ transpose_concat_elems_s8_4x4(s4, s5, s6, s7, &s4567);
+ transpose_concat_elems_s8_4x4(s5, s6, s7, s8, &s5678);
+ transpose_concat_elems_s8_4x4(s6, s7, s8, s9, &s6789);
+ transpose_concat_elems_s8_4x4(s7, s8, s9, sA, &s789A);
do {
uint8x8_t tB, tC, tD, tE;
@@ -524,7 +474,7 @@
int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
int8x16_t s89AB, s9ABC, sABCD, sBCDE;
- transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
+ transpose_concat_elems_s8_4x4(sB, sC, sD, sE, &sBCDE);
// Merge new data into block from previous iteration.
int8x16x2_t samples_LUT = { { s789A, sBCDE } };
@@ -591,14 +541,14 @@
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
s6789_hi, s789A_lo, s789A_hi;
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
- transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
- transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
- transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
- transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
+ transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+ transpose_concat_elems_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
+ transpose_concat_elems_s8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
+ transpose_concat_elems_s8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
+ transpose_concat_elems_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
do {
uint8x8_t tB, tC, tD, tE;
@@ -611,7 +561,7 @@
int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
sBCDE_lo, sBCDE_hi;
- transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
+ transpose_concat_elems_s8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
// Merge new data into block from previous iteration.
int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
@@ -723,10 +673,10 @@
int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
int8x16_t s0123, s1234, s2345, s3456;
- transpose_concat_4x4(s0, s1, s2, s3, &s0123);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+ transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
@@ -738,7 +688,7 @@
int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
int8x16_t s4567, s5678, s6789, s78910;
- transpose_concat_4x4(s7, s8, s9, s10, &s78910);
+ transpose_concat_elems_s8_4x4(s7, s8, s9, s10, &s78910);
// Merge new data into block from previous iteration.
int8x16x2_t samples_LUT = { { s3456, s78910 } };
@@ -791,10 +741,10 @@
// product.
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
s3456_lo, s3456_hi;
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+ transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
@@ -807,7 +757,7 @@
int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
s78910_lo, s78910_hi;
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+ transpose_concat_elems_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
// Merge new data into block from previous iteration.
int8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
index acd912e..9f58fae 100644
--- a/av1/common/arm/convolve_neon_i8mm.c
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -16,6 +16,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/arm/convolve_neon_i8mm.h"
@@ -352,58 +353,6 @@
x_filter_ptr, horiz_const);
}
-static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, XX, XX, XX, XX
- // a1: 10, 11, 12, 13, XX, XX, XX, XX
- // a2: 20, 21, 22, 23, XX, XX, XX, XX
- // a3: 30, 31, 32, 33, XX, XX, XX, XX
- //
- // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-
- uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
- uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
- uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
- uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
-
- uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
- uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
-
- uint16x8_t a0123 =
- vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
-
- *b = vreinterpretq_u8_u16(a0123);
-}
-
-static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
- uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1) {
- // Transpose 8-bit elements and concatenate result rows as follows:
- // a0: 00, 01, 02, 03, 04, 05, 06, 07
- // a1: 10, 11, 12, 13, 14, 15, 16, 17
- // a2: 20, 21, 22, 23, 24, 25, 26, 27
- // a3: 30, 31, 32, 33, 34, 35, 36, 37
- //
- // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-
- uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
- uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
- uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
- uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
-
- uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
- uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
-
- uint16x8x2_t a0123 =
- vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
-
- *b0 = vreinterpretq_u8_u16(a0123.val[0]);
- *b1 = vreinterpretq_u8_u16(a0123.val[1]);
-}
-
static inline int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1,
const uint8x16_t s2,
const int8x8_t filters_0_7,
@@ -455,21 +404,21 @@
// This operation combines a conventional transpose and the sample permute
// (see horizontal case) required before computing the dot product.
uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
- transpose_concat_4x4(s0, s1, s2, s3, &s0123);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456);
- transpose_concat_4x4(s4, s5, s6, s7, &s4567);
- transpose_concat_4x4(s5, s6, s7, s8, &s5678);
- transpose_concat_4x4(s6, s7, s8, s9, &s6789);
- transpose_concat_4x4(s7, s8, s9, sA, &s789A);
+ transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);
+ transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567);
+ transpose_concat_elems_u8_4x4(s5, s6, s7, s8, &s5678);
+ transpose_concat_elems_u8_4x4(s6, s7, s8, s9, &s6789);
+ transpose_concat_elems_u8_4x4(s7, s8, s9, sA, &s789A);
do {
uint8x8_t sB, sC, sD, sE;
load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE);
uint8x16_t s89AB, s9ABC, sABCD, sBCDE;
- transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
+ transpose_concat_elems_u8_4x4(sB, sC, sD, sE, &sBCDE);
// Merge new data into block from previous iteration.
uint8x16x2_t samples_LUT = { { s789A, sBCDE } };
@@ -523,14 +472,14 @@
uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
s6789_hi, s789A_lo, s789A_hi;
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
- transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
- transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
- transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
- transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
+ transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+ transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
+ transpose_concat_elems_u8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
+ transpose_concat_elems_u8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
+ transpose_concat_elems_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
do {
uint8x8_t sB, sC, sD, sE;
@@ -538,7 +487,7 @@
uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
sBCDE_lo, sBCDE_hi;
- transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
+ transpose_concat_elems_u8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
// Merge new data into block from previous iteration.
uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
@@ -638,17 +587,17 @@
// This operation combines a conventional transpose and the sample permute
// (see horizontal case) required before computing the dot product.
uint8x16_t s0123, s1234, s2345, s3456;
- transpose_concat_4x4(s0, s1, s2, s3, &s0123);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+ transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
uint8x16_t s4567, s5678, s6789, s78910;
- transpose_concat_4x4(s7, s8, s9, s10, &s78910);
+ transpose_concat_elems_u8_4x4(s7, s8, s9, s10, &s78910);
// Merge new data into block from previous iteration.
uint8x16x2_t samples_LUT = { { s3456, s78910 } };
@@ -692,10 +641,10 @@
// product.
uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
s3456_lo, s3456_hi;
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+ transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t s7, s8, s9, s10;
@@ -703,7 +652,7 @@
uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
s78910_lo, s78910_hi;
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+ transpose_concat_elems_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
// Merge new data into block from previous iteration.
uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
diff --git a/av1/common/arm/convolve_sve2.c b/av1/common/arm/convolve_sve2.c
index 536f441..3cda7d7 100644
--- a/av1/common/arm/convolve_sve2.c
+++ b/av1/common/arm/convolve_sve2.c
@@ -81,21 +81,21 @@
s6789[2], s789A[2];
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
- transpose_concat_4x4(s4, s5, s6, s7, s4567);
- transpose_concat_4x4(s5, s6, s7, s8, s5678);
- transpose_concat_4x4(s6, s7, s8, s9, s6789);
- transpose_concat_4x4(s7, s8, s9, sA, s789A);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
+ transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
+ transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);
do {
int16x4_t sB, sC, sD, sE;
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
- transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
+ transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
diff --git a/av1/common/arm/highbd_compound_convolve_sve2.c b/av1/common/arm/highbd_compound_convolve_sve2.c
index 668dfbf..493f218 100644
--- a/av1/common/arm/highbd_compound_convolve_sve2.c
+++ b/av1/common/arm/highbd_compound_convolve_sve2.c
@@ -506,10 +506,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
do {
int16x4_t s7, s8, s9, s10;
@@ -517,7 +517,7 @@
int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_4x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -559,10 +559,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
do {
int16x8_t s7, s8, s9, s10;
@@ -570,7 +570,7 @@
int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_8x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -682,10 +682,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
do {
int16x4_t s7, s8, s9, s10;
@@ -693,7 +693,7 @@
int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_4x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -735,10 +735,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
do {
int16x8_t s7, s8, s9, s10;
@@ -746,7 +746,7 @@
int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_8x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -1234,10 +1234,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
do {
int16x4_t s7, s8, s9, s10;
@@ -1245,7 +1245,7 @@
int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_4x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -1291,10 +1291,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
do {
int16x8_t s7, s8, s9, s10;
@@ -1302,7 +1302,7 @@
int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_8x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c
index fcf9d7b..8ce6021 100644
--- a/av1/common/arm/highbd_convolve_sve2.c
+++ b/av1/common/arm/highbd_convolve_sve2.c
@@ -19,6 +19,7 @@
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/aom_neon_sve2_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
@@ -456,21 +457,21 @@
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
s6789[2], s789A[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
- transpose_concat_4x4(s4, s5, s6, s7, s4567);
- transpose_concat_4x4(s5, s6, s7, s8, s5678);
- transpose_concat_4x4(s6, s7, s8, s9, s6789);
- transpose_concat_4x4(s7, s8, s9, sA, s789A);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
+ transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
+ transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);
do {
int16x4_t sB, sC, sD, sE;
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
- transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
+ transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);
// Use the above transpose and reuse data from the previous loop to get
// the rest.
@@ -597,10 +598,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
do {
int16x4_t s7, s8, s9, s10;
@@ -608,7 +609,7 @@
int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_4x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -651,10 +652,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
do {
int16x8_t s7, s8, s9, s10;
@@ -662,7 +663,7 @@
int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_8x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -757,10 +758,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max);
uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max);
@@ -797,10 +798,10 @@
// This operation combines a conventional transpose and the sample
// permute required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max);
uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max);
@@ -1245,21 +1246,21 @@
s6789[2], s789A[2];
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
- transpose_concat_4x4(s4, s5, s6, s7, s4567);
- transpose_concat_4x4(s5, s6, s7, s8, s5678);
- transpose_concat_4x4(s6, s7, s8, s9, s6789);
- transpose_concat_4x4(s7, s8, s9, sA, s789A);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
+ transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
+ transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);
do {
int16x4_t sB, sC, sD, sE;
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
- transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
+ transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);
// Use the above transpose and reuse data from the previous loop to get
// the rest.
@@ -1383,10 +1384,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
do {
int16x4_t s7, s8, s9, s10;
@@ -1394,7 +1395,7 @@
int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_4x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -1442,10 +1443,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
do {
int16x8_t s7, s8, s9, s10;
@@ -1453,7 +1454,7 @@
int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
// Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_8x4(s7, s8, s9, s10, s789A);
+ transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
// Merge new data into block from previous iteration.
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
@@ -1562,10 +1563,10 @@
// This operation combines a conventional transpose and the sample permute
// required before computing the dot product.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
- transpose_concat_4x4(s0, s1, s2, s3, s0123);
- transpose_concat_4x4(s1, s2, s3, s4, s1234);
- transpose_concat_4x4(s2, s3, s4, s5, s2345);
- transpose_concat_4x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
uint16x4_t d0 =
highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max);
@@ -1606,10 +1607,10 @@
// This operation combines a conventional transpose and the sample
// permute required before computing the dot product.
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_8x4(s3, s4, s5, s6, s3456);
+ transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
uint16x8_t d0 =
highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max);
diff --git a/av1/common/arm/highbd_convolve_sve2.h b/av1/common/arm/highbd_convolve_sve2.h
index abbad14..40ba2cd 100644
--- a/av1/common/arm/highbd_convolve_sve2.h
+++ b/av1/common/arm/highbd_convolve_sve2.h
@@ -27,59 +27,6 @@
};
// clang-format on
-static inline void transpose_concat_4x4(int16x4_t s0, int16x4_t s1,
- int16x4_t s2, int16x4_t s3,
- int16x8_t res[2]) {
- // Transpose 16-bit elements and concatenate result rows as follows:
- // s0: 00, 01, 02, 03
- // s1: 10, 11, 12, 13
- // s2: 20, 21, 22, 23
- // s3: 30, 31, 32, 33
- //
- // res[0]: 00 10 20 30 01 11 21 31
- // res[1]: 02 12 22 32 03 13 23 33
-
- int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
- int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
- int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
- int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
-
- int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
- int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
-
- int32x4x2_t s0123 = vzipq_s32(s01, s23);
-
- res[0] = vreinterpretq_s16_s32(s0123.val[0]);
- res[1] = vreinterpretq_s16_s32(s0123.val[1]);
-}
-
-static inline void transpose_concat_8x4(int16x8_t s0, int16x8_t s1,
- int16x8_t s2, int16x8_t s3,
- int16x8_t res[4]) {
- // Transpose 16-bit elements and concatenate result rows as follows:
- // s0: 00, 01, 02, 03, 04, 05, 06, 07
- // s1: 10, 11, 12, 13, 14, 15, 16, 17
- // s2: 20, 21, 22, 23, 24, 25, 26, 27
- // s3: 30, 31, 32, 33, 34, 35, 36, 37
- //
- // res[0]: 00 10 20 30 01 11 21 31
- // res[1]: 02 12 22 32 03 13 23 33
- // res[2]: 04 14 24 34 05 15 25 35
- // res[3]: 06 16 26 36 07 17 27 37
-
- int16x8x2_t tr01_16 = vzipq_s16(s0, s1);
- int16x8x2_t tr23_16 = vzipq_s16(s2, s3);
- int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]),
- vreinterpretq_s32_s16(tr23_16.val[0]));
- int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]),
- vreinterpretq_s32_s16(tr23_16.val[1]));
-
- res[0] = vreinterpretq_s16_s32(tr01_32.val[0]);
- res[1] = vreinterpretq_s16_s32(tr01_32.val[1]);
- res[2] = vreinterpretq_s16_s32(tr23_32.val[0]);
- res[3] = vreinterpretq_s16_s32(tr23_32.val[1]);
-}
-
static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
uint16x8_t tbl, int16x8_t res[4]) {
res[0] = aom_tbl2_s16(t0[0], t1[0], tbl);