Don't read 8 bytes from a 3-byte array

The elements of the av1_fwd_txfm_shift_ls array point to 3-byte (global) arrays. It is unsafe to read from a 3-byte array using vld1_s8(), which reads 8 bytes. Instead, read shift[0], shift[1], and shift[2] individually and broadcast each one with vdupq_n_s16().

This bug was introduced when av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c was added in https://aomedia-review.googlesource.com/c/aom/+/110221. ASan reported global-buffer-overflow errors.

Change-Id: I53f06dba6b6be3e7216c4f2359e1060cf9d7fd87
(cherry picked from commit 582d2fd1e9b6a212cb7d30bcf63d3c1e78aa8fca)
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c index ed58681..8a282b3 100644 --- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c +++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -2307,10 +2307,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) { load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); } else { @@ -2351,10 +2350,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) { load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); } else { @@ -2393,10 +2391,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) { load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); } else { @@ -2439,10 +2436,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = 
vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); else @@ -2480,10 +2476,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); else @@ -2521,10 +2516,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) { load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); } else { @@ -2567,10 +2561,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); if (ud_flip) { 
load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); } else { @@ -2616,10 +2609,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); @@ -2663,10 +2655,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); @@ -2709,10 +2700,9 @@ int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); for (int i = 0; i < 2; i++) { if (ud_flip) { @@ -2762,10 +2752,9 @@ if (col_txfm != NULL && row_txfm != NULL) { int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = 
vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); for (int i = 0; i < 2; i++) { if (ud_flip) { @@ -2821,10 +2810,9 @@ if (col_txfm != NULL && row_txfm != NULL) { int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); for (int i = 0; i < 4; i++) { if (ud_flip) { @@ -2881,10 +2869,9 @@ const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type]; if (col_txfm != NULL && row_txfm != NULL) { - const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0]))); - const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0); - const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1); - const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2); + const int16x8_t v_shift0 = vdupq_n_s16(shift[0]); + const int16x8_t v_shift1 = vdupq_n_s16(shift[1]); + const int16x8_t v_shift2 = vdupq_n_s16(shift[2]); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip);