aom_dsp/x86/highbd_quantize_intrin_sse2.c - aom - Git at Google

 /*
  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include <emmintrin.h>

 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "config/aom_dsp_rtcd.h"

 // Quantizes `count` 32-bit coefficients, handling them four at a time.
 //
 // Every two-entry table (zbin, round, quant, quant_shift, dequant) is indexed
 // with `position != 0`: entry 0 is used for the coefficient at index 0 and
 // entry 1 for every other coefficient.
 //
 //   coeff_ptr    input coefficients; read with aligned 128-bit loads.
 //   count        number of coefficients; only count / 4 whole groups are
 //                examined.
 //   qcoeff_ptr   output: quantized coefficients (cleared first).
 //   dqcoeff_ptr  output: qcoeff * dequant (cleared first).
 //   eob_ptr      output: 1 + the largest iscan[] value among coefficients
 //                that quantized to a non-zero value, or 0 if there are none.
 //   scan         unused.
 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                 const int16_t *zbin_ptr,
                                 const int16_t *round_ptr,
                                 const int16_t *quant_ptr,
                                 const int16_t *quant_shift_ptr,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                 const int16_t *scan, const int16_t *iscan) {
   const int num_groups = (int)count / 4;
   int active_groups = num_groups;
   int last_nonzero = -1;
   int grp, lane;
   __m128i zbin_vec[2];
   __m128i neg_zbin_vec[2];

   (void)scan;

   // zbin_vec[0] serves the first group: lane 0 carries zbin_ptr[0], the other
   // three lanes carry zbin_ptr[1]. zbin_vec[1] serves all later groups.
   zbin_vec[1] = _mm_set1_epi32((int)zbin_ptr[1]);
   zbin_vec[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1],
                               (int)zbin_ptr[1], (int)zbin_ptr[0]);

   // Negated thresholds (0 - zbin) for the lower side of the dead zone.
   neg_zbin_vec[0] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[0]);
   neg_zbin_vec[1] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[1]);

   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

   // Pre-scan: walk backwards from the last group and drop every trailing
   // group whose four values all lie strictly between -zbin and +zbin.
   for (grp = num_groups - 1; grp >= 0; grp--) {
     const __m128i v = _mm_load_si128((const __m128i *)(coeff_ptr + grp * 4));
     const __m128i below = _mm_cmplt_epi32(v, zbin_vec[grp != 0]);
     const __m128i above = _mm_cmpgt_epi32(v, neg_zbin_vec[grp != 0]);
     // 0xffff means all 16 mask bytes are set, i.e. all four lanes are inside.
     if (_mm_movemask_epi8(_mm_and_si128(below, above)) != 0xffff) break;
     active_groups--;
   }

   // Quantization: process the groups that survived the pre-scan.
   for (grp = 0; grp < active_groups; grp++) {
     int magnitude[4];
     int sign[4];
     const __m128i v = _mm_load_si128((const __m128i *)(coeff_ptr + grp * 4));
     // Arithmetic shift by 31 yields 0 for non-negative and -1 for negative.
     const __m128i sign_vec = _mm_srai_epi32(v, 31);
     // (v ^ sign) - sign is the absolute value.
     const __m128i abs_vec =
         _mm_sub_epi32(_mm_xor_si128(v, sign_vec), sign_vec);
     // SSE2 has no >= compare, so combine > and ==.
     const __m128i at_or_above =
         _mm_or_si128(_mm_cmpgt_epi32(abs_vec, zbin_vec[grp != 0]),
                      _mm_cmpeq_epi32(abs_vec, zbin_vec[grp != 0]));
     const int lane_mask = _mm_movemask_epi8(at_or_above);

     _mm_storeu_si128((__m128i *)magnitude, abs_vec);
     _mm_storeu_si128((__m128i *)sign, sign_vec);

     for (lane = 0; lane < 4; lane++) {
       const int pos = 4 * grp + lane;
       const int tbl = pos != 0;

       // Each 32-bit lane owns four mask bits; testing the lowest is enough.
       if (!(lane_mask & (1 << (4 * lane)))) continue;

       {
         const int64_t rounded = magnitude[lane] + round_ptr[tbl];
         const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
         const uint32_t abs_qcoeff =
             (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 16);

         // Re-apply the sign: (x ^ s) - s negates x when s == -1.
         qcoeff_ptr[pos] =
             (int)(abs_qcoeff ^ (uint32_t)sign[lane]) - sign[lane];
         dqcoeff_ptr[pos] = qcoeff_ptr[pos] * dequant_ptr[tbl];

         if (abs_qcoeff && iscan[pos] > last_nonzero) {
           last_nonzero = iscan[pos];
         }
       }
     }
   }

   *eob_ptr = last_nonzero + 1;
 }

 // Variant of aom_highbd_quantize_b_sse2 in which zbin and round are first
 // reduced with ROUND_POWER_OF_TWO(x, 1), the final shift is 15 instead of 16,
 // and the dequantized value is divided by 2.
 //
 // A vector pre-scan collects the indices of the coefficients that are not
 // strictly inside (-zbin, +zbin); only those are then quantized one by one.
 // Two-entry tables are indexed with `rc != 0` (entry 0 for index 0 only).
 //
 //   coeff_ptr    input coefficients; read with aligned 128-bit loads.
 //   n_coeffs     number of coefficients; n_coeffs / 4 whole groups are
 //                examined.
 //   qcoeff_ptr   output: quantized coefficients (cleared first).
 //   dqcoeff_ptr  output: qcoeff * dequant / 2 (cleared first).
 //   eob_ptr      output: 1 + the largest iscan[] value among coefficients
 //                that quantized to a non-zero value, or 0 if there are none.
 //   scan         unused.
 //
 // NOTE(review): the index list holds 1024 entries and is not bounds-checked,
 // so this presumably relies on n_coeffs <= 1024 -- confirm against callers.
 void aom_highbd_quantize_b_32x32_sse2(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
   __m128i zbin_vec[2];
   __m128i neg_zbin_vec[2];
   int selected[1024];
   int num_selected = 0;
   int last_nonzero = -1;
   int grp, lane, n;
   const int zbin_first = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
   const int zbin_rest = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);

   (void)scan;

   // zbin_vec[0] serves the first group (lane 0 uses zbin_first); zbin_vec[1]
   // serves all later groups.
   zbin_vec[1] = _mm_set1_epi32(zbin_rest);
   zbin_vec[0] = _mm_set_epi32(zbin_rest, zbin_rest, zbin_rest, zbin_first);

   // Negated thresholds (0 - zbin) for the lower side of the dead zone.
   neg_zbin_vec[0] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[0]);
   neg_zbin_vec[1] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[1]);

   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

   // Pre-scan: record, in ascending order, every index whose coefficient is
   // not strictly between -zbin and +zbin.
   for (grp = 0; grp < n_coeffs / 4; grp++) {
     const __m128i v = _mm_load_si128((const __m128i *)(coeff_ptr + grp * 4));
     const __m128i below = _mm_cmplt_epi32(v, zbin_vec[grp != 0]);
     const __m128i above = _mm_cmpgt_epi32(v, neg_zbin_vec[grp != 0]);
     const int inside_mask = _mm_movemask_epi8(_mm_and_si128(below, above));

     // Each 32-bit lane owns one nibble of the 16-bit byte mask.
     for (lane = 0; lane < 4; lane++) {
       if (!(inside_mask & (0xf << (4 * lane)))) {
         selected[num_selected++] = grp * 4 + lane;
       }
     }
   }

   // Quantization: scalar work on the selected coefficients only. The list
   // may be empty.
   for (n = 0; n < num_selected; n++) {
     const int rc = selected[n];
     const int tbl = rc != 0;
     const int value = coeff_ptr[rc];
     const int value_sign = AOMSIGN(value);
     const int magnitude = (value ^ value_sign) - value_sign;
     const int64_t rounded =
         magnitude + ROUND_POWER_OF_TWO(round_ptr[tbl], 1);
     const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
     const uint32_t abs_qcoeff =
         (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 15);

     // Re-apply the sign: (x ^ s) - s negates x when s == -1.
     qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)value_sign) - value_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[tbl] / 2;

     if (abs_qcoeff && iscan[rc] > last_nonzero) {
       last_nonzero = iscan[rc];
     }
   }

   *eob_ptr = last_nonzero + 1;
 }

 // Variant of aom_highbd_quantize_b_sse2 in which zbin and round are first
 // reduced with ROUND_POWER_OF_TWO(x, 2), the final shift is 14 instead of 16,
 // and the dequantized value is divided by 4.
 //
 // A vector pre-scan collects the indices of the coefficients that are not
 // strictly inside (-zbin, +zbin); only those are then quantized one by one.
 // Two-entry tables are indexed with `rc != 0` (entry 0 for index 0 only).
 //
 //   coeff_ptr    input coefficients; read with aligned 128-bit loads.
 //   n_coeffs     number of coefficients; n_coeffs / 4 whole groups are
 //                examined.
 //   qcoeff_ptr   output: quantized coefficients (cleared first).
 //   dqcoeff_ptr  output: qcoeff * dequant / 4 (cleared first).
 //   eob_ptr      output: 1 + the largest iscan[] value among coefficients
 //                that quantized to a non-zero value, or 0 if there are none.
 //   scan         unused.
 //
 // NOTE(review): the index list holds 1024 entries and is not bounds-checked,
 // so this presumably relies on n_coeffs <= 1024 -- confirm against callers.
 void aom_highbd_quantize_b_64x64_sse2(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
   __m128i zbin_vec[2];
   __m128i neg_zbin_vec[2];
   int selected[1024];
   int num_selected = 0;
   int last_nonzero = -1;
   int grp, lane, n;
   const int zbin_first = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
   const int zbin_rest = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);

   (void)scan;

   // zbin_vec[0] serves the first group (lane 0 uses zbin_first); zbin_vec[1]
   // serves all later groups.
   zbin_vec[1] = _mm_set1_epi32(zbin_rest);
   zbin_vec[0] = _mm_set_epi32(zbin_rest, zbin_rest, zbin_rest, zbin_first);

   // Negated thresholds (0 - zbin) for the lower side of the dead zone.
   neg_zbin_vec[0] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[0]);
   neg_zbin_vec[1] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[1]);

   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

   // Pre-scan: record, in ascending order, every index whose coefficient is
   // not strictly between -zbin and +zbin.
   for (grp = 0; grp < n_coeffs / 4; grp++) {
     const __m128i v = _mm_load_si128((const __m128i *)(coeff_ptr + grp * 4));
     const __m128i below = _mm_cmplt_epi32(v, zbin_vec[grp != 0]);
     const __m128i above = _mm_cmpgt_epi32(v, neg_zbin_vec[grp != 0]);
     const int inside_mask = _mm_movemask_epi8(_mm_and_si128(below, above));

     // Each 32-bit lane owns one nibble of the 16-bit byte mask.
     for (lane = 0; lane < 4; lane++) {
       if (!(inside_mask & (0xf << (4 * lane)))) {
         selected[num_selected++] = grp * 4 + lane;
       }
     }
   }

   // Quantization: scalar work on the selected coefficients only. The list
   // may be empty.
   for (n = 0; n < num_selected; n++) {
     const int rc = selected[n];
     const int tbl = rc != 0;
     const int value = coeff_ptr[rc];
     const int value_sign = AOMSIGN(value);
     const int magnitude = (value ^ value_sign) - value_sign;
     const int64_t rounded =
         magnitude + ROUND_POWER_OF_TWO(round_ptr[tbl], 2);
     const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
     const uint32_t abs_qcoeff =
         (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 14);

     // Re-apply the sign: (x ^ s) - s negates x when s == -1.
     qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)value_sign) - value_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[tbl] / 4;

     if (abs_qcoeff && iscan[rc] > last_nonzero) {
       last_nonzero = iscan[rc];
     }
   }

   *eob_ptr = last_nonzero + 1;
 }
	/*
	* Copyright (c) 2016, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include <emmintrin.h>

	#include "aom_dsp/aom_dsp_common.h"
	#include "aom_mem/aom_mem.h"
	#include "aom_ports/mem.h"
	#include "config/aom_dsp_rtcd.h"

	// Quantizes `count` 32-bit coefficients, handling them four at a time.
	//
	// Every two-entry table (zbin, round, quant, quant_shift, dequant) is
	// indexed with `position != 0`: entry 0 is used for the coefficient at
	// index 0 and entry 1 for every other coefficient.
	//
	//   coeff_ptr    input coefficients; read with aligned 128-bit loads.
	//   count        number of coefficients; only count / 4 whole groups are
	//                examined.
	//   qcoeff_ptr   output: quantized coefficients (cleared first).
	//   dqcoeff_ptr  output: qcoeff * dequant (cleared first).
	//   eob_ptr      output: 1 + the largest iscan[] value among coefficients
	//                that quantized to a non-zero value, or 0 if none.
	//   scan         unused.
	//
	// NOTE(review): this repeats the definition of aom_highbd_quantize_b_sse2
	// that appears earlier in this file; only one copy can be kept for the
	// file to compile.
	void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
	                                const int16_t *zbin_ptr,
	                                const int16_t *round_ptr,
	                                const int16_t *quant_ptr,
	                                const int16_t *quant_shift_ptr,
	                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
	                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
	                                const int16_t *scan, const int16_t *iscan) {
	  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
	  __m128i zbins[2];
	  __m128i nzbins[2];

	  // zbins[0] is for the first group of four: lane 0 carries zbin_ptr[0],
	  // the other lanes zbin_ptr[1]. zbins[1] is for all later groups.
	  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
	                           (int)zbin_ptr[0]);
	  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

	  // nzbins = 0 - zbins: the lower edge of the dead zone.
	  nzbins[0] = _mm_setzero_si128();
	  nzbins[1] = _mm_setzero_si128();
	  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
	  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

	  (void)scan;

	  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
	  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

	  // Pre-scan pass: walk backwards and drop trailing groups whose four
	  // values all lie strictly between -zbin and +zbin.
	  for (i = ((int)count / 4) - 1; i >= 0; i--) {
	    __m128i coeffs, cmp1, cmp2;
	    int test;
	    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
	    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
	    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
	    cmp1 = _mm_and_si128(cmp1, cmp2);
	    test = _mm_movemask_epi8(cmp1);
	    // 0xffff: all 16 mask bytes set, i.e. all four lanes are inside.
	    if (test == 0xffff)
	      non_zero_regs--;
	    else
	      break;
	  }

	  // Quantization pass:
	  for (i = 0; i < non_zero_regs; i++) {
	    __m128i coeffs, coeffs_sign, tmp1, tmp2;
	    int test;
	    int abs_coeff[4];
	    int coeff_sign[4];

	    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
	    // Sign mask is 0 or -1 per lane; (x ^ s) - s gives the absolute value.
	    coeffs_sign = _mm_srai_epi32(coeffs, 31);
	    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
	    // abs >= zbin, built from > and == because SSE2 has no >= compare.
	    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
	    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
	    tmp1 = _mm_or_si128(tmp1, tmp2);
	    test = _mm_movemask_epi8(tmp1);
	    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
	    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

	    for (j = 0; j < 4; j++) {
	      // Each 32-bit lane owns four mask bits; test the lowest one.
	      if (test & (1 << (4 * j))) {
	        int k = 4 * i + j;
	        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
	        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
	        const uint32_t abs_qcoeff =
	            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
	        // Re-apply the sign to the quantized magnitude.
	        qcoeff_ptr[k] =
	            (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
	        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
	        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
	      }
	    }
	  }
	  *eob_ptr = eob_i + 1;
	}

	// Variant of aom_highbd_quantize_b_sse2 in which zbin and round are first
	// reduced with ROUND_POWER_OF_TWO(x, 1), the final shift is 15 instead of
	// 16, and the dequantized value is divided by 2.
	//
	// A vector pre-scan collects the indices of the coefficients that are not
	// strictly inside (-zbin, +zbin); only those are quantized, one by one.
	// Two-entry tables are indexed with `rc != 0` (entry 0 for index 0 only).
	//
	//   coeff_ptr    input coefficients; read with aligned 128-bit loads.
	//   n_coeffs     number of coefficients; n_coeffs / 4 whole groups are
	//                examined.
	//   qcoeff_ptr   output: quantized coefficients (cleared first).
	//   dqcoeff_ptr  output: qcoeff * dequant / 2 (cleared first).
	//   eob_ptr      output: 1 + the largest iscan[] value among coefficients
	//                that quantized to a non-zero value, or 0 if none.
	//   scan         unused.
	//
	// NOTE(review): idx_arr holds 1024 entries and is not bounds-checked, so
	// this presumably relies on n_coeffs <= 1024 -- confirm against callers.
	// NOTE(review): this repeats a definition that appears earlier in this
	// file; only one copy can be kept for the file to compile.
	void aom_highbd_quantize_b_32x32_sse2(
	    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
	    const int16_t *round_ptr, const int16_t *quant_ptr,
	    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
	    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
	    const int16_t *scan, const int16_t *iscan) {
	  __m128i zbins[2];
	  __m128i nzbins[2];
	  int idx = 0;
	  int idx_arr[1024];
	  int i, eob = -1;
	  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
	  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
	  (void)scan;
	  // zbins[0] is for the first group of four (lane 0 uses zbin0_tmp);
	  // zbins[1] is for all later groups.
	  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
	  zbins[1] = _mm_set1_epi32(zbin1_tmp);

	  // nzbins = 0 - zbins: the lower edge of the dead zone.
	  nzbins[0] = _mm_setzero_si128();
	  nzbins[1] = _mm_setzero_si128();
	  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
	  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

	  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
	  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

	  // Pre-scan pass: record every index whose coefficient is not strictly
	  // between -zbin and +zbin. Each lane owns one nibble of the byte mask.
	  for (i = 0; i < n_coeffs / 4; i++) {
	    __m128i coeffs, cmp1, cmp2;
	    int test;
	    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
	    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
	    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
	    cmp1 = _mm_and_si128(cmp1, cmp2);
	    test = _mm_movemask_epi8(cmp1);
	    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
	    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
	    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
	    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
	  }

	  // Quantization pass: only process the coefficients selected in
	  // pre-scan pass. Note: idx can be zero.
	  for (i = 0; i < idx; i++) {
	    const int rc = idx_arr[i];
	    const int coeff = coeff_ptr[rc];
	    const int coeff_sign = AOMSIGN(coeff);
	    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
	    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
	    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
	    const uint32_t abs_qcoeff =
	        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
	    // Re-apply the sign to the quantized magnitude.
	    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
	    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
	    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
	  }
	  *eob_ptr = eob + 1;
	}

	// Variant of aom_highbd_quantize_b_sse2 in which zbin and round are first
	// reduced with ROUND_POWER_OF_TWO(x, 2), the final shift is 14 instead of
	// 16, and the dequantized value is divided by 4.
	//
	// A vector pre-scan collects the indices of the coefficients that are not
	// strictly inside (-zbin, +zbin); only those are quantized, one by one.
	// Two-entry tables are indexed with `rc != 0` (entry 0 for index 0 only).
	//
	//   coeff_ptr    input coefficients; read with aligned 128-bit loads.
	//   n_coeffs     number of coefficients; n_coeffs / 4 whole groups are
	//                examined.
	//   qcoeff_ptr   output: quantized coefficients (cleared first).
	//   dqcoeff_ptr  output: qcoeff * dequant / 4 (cleared first).
	//   eob_ptr      output: 1 + the largest iscan[] value among coefficients
	//                that quantized to a non-zero value, or 0 if none.
	//   scan         unused.
	//
	// NOTE(review): idx_arr holds 1024 entries and is not bounds-checked, so
	// this presumably relies on n_coeffs <= 1024 -- confirm against callers.
	// NOTE(review): this repeats a definition that appears earlier in this
	// file; only one copy can be kept for the file to compile.
	void aom_highbd_quantize_b_64x64_sse2(
	    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
	    const int16_t *round_ptr, const int16_t *quant_ptr,
	    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
	    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
	    const int16_t *scan, const int16_t *iscan) {
	  __m128i zbins[2];
	  __m128i nzbins[2];
	  int idx = 0;
	  int idx_arr[1024];
	  int i, eob = -1;
	  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
	  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
	  (void)scan;
	  // zbins[0] is for the first group of four (lane 0 uses zbin0_tmp);
	  // zbins[1] is for all later groups.
	  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
	  zbins[1] = _mm_set1_epi32(zbin1_tmp);

	  // nzbins = 0 - zbins: the lower edge of the dead zone.
	  nzbins[0] = _mm_setzero_si128();
	  nzbins[1] = _mm_setzero_si128();
	  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
	  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

	  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
	  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

	  // Pre-scan pass: record every index whose coefficient is not strictly
	  // between -zbin and +zbin. Each lane owns one nibble of the byte mask.
	  for (i = 0; i < n_coeffs / 4; i++) {
	    __m128i coeffs, cmp1, cmp2;
	    int test;
	    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
	    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
	    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
	    cmp1 = _mm_and_si128(cmp1, cmp2);
	    test = _mm_movemask_epi8(cmp1);
	    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
	    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
	    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
	    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
	  }

	  // Quantization pass: only process the coefficients selected in
	  // pre-scan pass. Note: idx can be zero.
	  for (i = 0; i < idx; i++) {
	    const int rc = idx_arr[i];
	    const int coeff = coeff_ptr[rc];
	    const int coeff_sign = AOMSIGN(coeff);
	    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
	    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
	    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
	    const uint32_t abs_qcoeff =
	        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
	    // Re-apply the sign to the quantized magnitude.
	    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
	    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
	    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
	  }
	  *eob_ptr = eob + 1;
	}