/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/idct.h"
#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"

#if CONFIG_IMPROVE_LOSSLESS_TXM
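
// Gathers a 4x4 tile of int16_t residuals (four 8-byte rows at the given
// element stride) into a single 256-bit register, laid out row-major as
// [Row3 | Row2 | Row1 | Row0].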
static INLINE __m256i load_4x4_s16_avx2(const int16_t *src, int stride) {
  // Cast src pointer to handle byte-level stride arithmetic
  const uint8_t *src_bytes = (const uint8_t *)src;
  const ptrdiff_t stride_bytes = stride * 2;
  // 1. Load each 8-byte row into the lower 64 bits of a __m128i register.
  //    _mm_loadl_epi64 handles potential unaligned access.
  __m128i r0 = _mm_loadl_epi64(
      (const __m128i *)(src_bytes + 0 * stride_bytes));  // Row 0
  __m128i r1 = _mm_loadl_epi64(
      (const __m128i *)(src_bytes + 1 * stride_bytes));  // Row 1
  __m128i r2 = _mm_loadl_epi64(
      (const __m128i *)(src_bytes + 2 * stride_bytes));  // Row 2
  __m128i r3 = _mm_loadl_epi64(
      (const __m128i *)(src_bytes + 3 * stride_bytes));  // Row 3
  // Example: r0 = [0x00...00 | Row0_Bytes7..0] (upper 64 bits are zeroed)
  // 2. Combine pairs of rows (r0, r1) and (r2, r3) into 128-bit registers.
  //    _mm_unpacklo_epi64 concatenates the low 64 bits of its two operands.
  __m128i low128 =
      _mm_unpacklo_epi64(r0, r1);  // low128 = [Row1_Bytes7..0 | Row0_Bytes7..0]
  __m128i high128 = _mm_unpacklo_epi64(
      r2, r3);  // high128 = [Row3_Bytes7..0 | Row2_Bytes7..0]
  // 3. Assemble the full 256-bit register from the two 128-bit lanes.
  //    _mm256_set_m128i expects arguments in (hi, lo) order.
  __m256i block256 = _mm256_set_m128i(high128, low128);
  // block256 = [Row3 | Row2 | Row1 | Row0]
  return block256;
}
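
// Widens the 16 packed int16_t values produced above to int32_t and writes
// them out as four 4-element rows at the given int32_t element stride.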
static INLINE void store_4x4_s16_s32_avx2(__m256i data_s16, int32_t *dst,
                                          int stride) {
  const ptrdiff_t stride_bytes = stride * 4;
  // --- Step 1: Convert 16x s16 -> 16x s32 (results in two __m256i) ---
  // Isolate the lower 128 bits (first 8 x int16_t: elements 0..7)
  __m128i low128_s16 = _mm256_castsi256_si128(data_s16);
  // Sign-extend lower 8 int16_t -> first 8 int32_t
  __m256i s32_low = _mm256_cvtepi16_epi32(low128_s16);
  // s32_low now contains [el7, el6, el5, el4 | el3, el2, el1, el0]
  // (32-bit elements)
  // Isolate the upper 128 bits (next 8 x int16_t: elements 8..15)
  __m128i high128_s16 = _mm256_extracti128_si256(data_s16, 1);
  // Sign-extend upper 8 int16_t -> next 8 int32_t
  __m256i s32_high = _mm256_cvtepi16_epi32(high128_s16);
  // s32_high now contains [el15, el14, el13, el12 | el11, el10, el9, el8]
  // (32-bit elements)
  // --- Step 2: Store the 16x s32 from the two registers into memory ---
  // Cast destination pointer for byte-level stride arithmetic
  uint8_t *dst_bytes = (uint8_t *)dst;
  // Extract Row 0 data (elements 0..3) from the lower 128 bits of s32_low
  __m128i row0_data = _mm256_castsi256_si128(s32_low);
  // Store Row 0 (4x int32_t = 16 bytes) at the base destination address.
  // Using _mm_storeu_si128 for safety with potentially unaligned destinations.
  _mm_storeu_si128((__m128i *)(dst_bytes + 0 * stride_bytes), row0_data);
  // Extract Row 1 data (elements 4..7) from the upper 128 bits of s32_low
  __m128i row1_data = _mm256_extracti128_si256(s32_low, 1);
  // Store Row 1 (16 bytes) at the destination address + 1*stride.
  _mm_storeu_si128((__m128i *)(dst_bytes + 1 * stride_bytes), row1_data);
  // Extract Row 2 data (elements 8..11) from the lower 128 bits of s32_high
  __m128i row2_data = _mm256_castsi256_si128(s32_high);
  // Store Row 2 (16 bytes) at the destination address + 2*stride.
  _mm_storeu_si128((__m128i *)(dst_bytes + 2 * stride_bytes), row2_data);
  // Extract Row 3 data (elements 12..15) from the upper 128 bits of s32_high
  __m128i row3_data = _mm256_extracti128_si256(s32_high, 1);
  // Store Row 3 (16 bytes) at the destination address + 3*stride.
  _mm_storeu_si128((__m128i *)(dst_bytes + 3 * stride_bytes), row3_data);
}
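
// Forward identity transform for lossless coding: processes the block in 4x4
// tiles, left-shifting each int16_t residual by 3 - av1_get_tx_scale() bits
// and widening the result to a 32-bit coefficient. Coefficients are written
// densely, with row stride txw.
//
// Usage sketch (hypothetical caller; assumes a lossless 4x4 block):
//   TxfmParam param = { 0 };
//   param.tx_size = TX_4X4;  // av1_get_tx_scale() is 0 here -> shift by 3
//   av1_lossless_fwd_idtx_avx2(src_diff, coeff, diff_stride, &param);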
void av1_lossless_fwd_idtx_avx2(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TxfmParam *txfm_param) {
  const int txw = tx_size_wide[txfm_param->tx_size];
  const int txh = tx_size_high[txfm_param->tx_size];
  const int scale_bits = 3 - av1_get_tx_scale(txfm_param->tx_size);
  for (int i = 0; i < txh; i += MI_SIZE) {
    for (int j = 0; j < txw; j += MI_SIZE) {
      __m256i block =
          load_4x4_s16_avx2(src_diff + i * diff_stride + j, diff_stride);
      // Perform the left shift on the 16 packed int16_t elements.
      // _mm256_slli_epi16 shifts zeros in from the right.
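      // Note (headroom): scale_bits is at most 3 (when av1_get_tx_scale() is
      // 0), and a 12-bit-depth residual needs at most 13 signed bits, so the
      // shifted value still fits in the 16-bit lanes used here.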
      __m256i shifted_block = _mm256_slli_epi16(block, scale_bits);
      store_4x4_s16_s32_avx2(shifted_block, coeff + i * txw + j, txw);
    }
  }
}
#endif // CONFIG_IMPROVE_LOSSLESS_TXM