Remove unused copies of transform related source code - Library size reduces: 165 kB, 292 kB (HBD). Change-Id: I50cb630dde326bd2a28c0db4b7e2d53c2fd94a2a
diff --git a/av1/av1_common.mk b/av1/av1_common.mk index 43b76ad..4541dd0 100644 --- a/av1/av1_common.mk +++ b/av1/av1_common.mk
@@ -30,8 +30,6 @@ AV1_COMMON_SRCS-yes += common/filter.c AV1_COMMON_SRCS-yes += common/idct.h AV1_COMMON_SRCS-yes += common/idct.c -AV1_COMMON_SRCS-yes += common/av1_inv_txfm.h -AV1_COMMON_SRCS-yes += common/av1_inv_txfm.c AV1_COMMON_SRCS-yes += common/loopfilter.h AV1_COMMON_SRCS-yes += common/thread_common.h AV1_COMMON_SRCS-yes += common/mv.h @@ -61,8 +59,6 @@ AV1_COMMON_SRCS-yes += common/scan.c AV1_COMMON_SRCS-yes += common/scan.h # TODO(angiebird) the forward transform belongs under encoder/ -AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.h -AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c AV1_COMMON_SRCS-yes += common/av1_txfm.h AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c @@ -123,9 +119,6 @@ AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c ifeq ($(CONFIG_AV1_ENCODER),yes) -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_impl_sse2.h AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c @@ -143,7 +136,4 @@ AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c endif -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.c -AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.h - $(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/common/av1_fwd_txfm.c b/av1/common/av1_fwd_txfm.c deleted file mode 100644 index 84a3876..0000000 --- a/av1/common/av1_fwd_txfm.c +++ /dev/null
@@ -1,813 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "av1/common/av1_fwd_txfm.h" -#include <assert.h> -#include "./av1_rtcd.h" - -void av1_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - tran_low_t intermediate[4 * 4]; - const tran_low_t *in_low = NULL; - tran_low_t *out = intermediate; - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - tran_high_t in_high[4]; // canbe16 - tran_high_t step[4]; // canbe16 - tran_high_t temp1, temp2; // needs32 - int i; - for (i = 0; i < 4; ++i) { - // Load inputs. - if (0 == pass) { - in_high[0] = input[0 * stride] * 16; - in_high[1] = input[1 * stride] * 16; - in_high[2] = input[2 * stride] * 16; - in_high[3] = input[3 * stride] * 16; - if (i == 0 && in_high[0]) { - in_high[0] += 1; - } - } else { - assert(in_low != NULL); - in_high[0] = in_low[0 * 4]; - in_high[1] = in_low[1 * 4]; - in_high[2] = in_low[2 * 4]; - in_high[3] = in_low[3 * 4]; - in_low++; - } - // Transform. - step[0] = in_high[0] + in_high[3]; - step[1] = in_high[1] + in_high[2]; - step[2] = in_high[1] - in_high[2]; - step[3] = in_high[0] - in_high[3]; - temp1 = (step[0] + step[1]) * cospi_16_64; - temp2 = (step[0] - step[1]) * cospi_16_64; - out[0] = (tran_low_t)fdct_round_shift(temp1); - out[2] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; - temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; - out[1] = (tran_low_t)fdct_round_shift(temp1); - out[3] = (tran_low_t)fdct_round_shift(temp2); - // Do next column (which is a transposed row in second/horizontal pass) - input++; - out += 4; - } - // Setup in_low/out for next pass. - in_low = intermediate; - out = output; - } - - { - int i, j; - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; - } - } -} - -void av1_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 4; ++r) - for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - - output[0] = sum << 1; - output[1] = 0; -} - -void av1_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { - int i, j; - tran_low_t intermediate[64]; - int pass; - tran_low_t *output = intermediate; - const tran_low_t *in = NULL; - - // Transform columns - for (pass = 0; pass < 2; ++pass) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 - tran_high_t t0, t1, t2, t3; // needs32 - tran_high_t x0, x1, x2, x3; // canbe16 - - for (i = 0; i < 8; i++) { - // stage 1 - if (pass == 0) { - s0 = (input[0 * stride] + input[7 * stride]) * 4; - s1 = (input[1 * stride] + input[6 * stride]) * 4; - s2 = (input[2 * stride] + input[5 * stride]) * 4; - s3 = (input[3 * stride] + input[4 * stride]) * 4; - s4 = (input[3 * stride] - input[4 * stride]) * 4; - s5 = (input[2 * stride] - input[5 * stride]) * 4; - s6 = (input[1 * stride] - input[6 * stride]) * 4; - s7 = (input[0 * stride] - input[7 * stride]) * 4; - ++input; - } else { - s0 = in[0 * 8] + in[7 * 8]; - s1 = in[1 * 8] + in[6 * 8]; - s2 = in[2 * 8] + in[5 * 8]; - s3 = in[3 * 8] + in[4 * 8]; - s4 = in[3 * 8] - in[4 * 8]; - s5 = in[2 * 8] - in[5 * 8]; - s6 = in[1 * 8] - in[6 * 8]; - s7 = in[0 * 8] - in[7 * 8]; - ++in; - } - - // fdct4(step, step); - x0 = s0 + s3; - x1 = s1 + s2; - x2 = s1 - s2; - x3 = s0 - s3; - t0 = (x0 + x1) * cospi_16_64; - t1 = (x0 - x1) * cospi_16_64; - t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0] = (tran_low_t)fdct_round_shift(t0); - output[2] = (tran_low_t)fdct_round_shift(t2); - output[4] = (tran_low_t)fdct_round_shift(t1); - output[6] = (tran_low_t)fdct_round_shift(t3); - - // Stage 2 - t0 = (s6 - s5) * cospi_16_64; - t1 = (s6 + s5) * cospi_16_64; - t2 = fdct_round_shift(t0); - t3 = fdct_round_shift(t1); - - // Stage 3 - x0 = s4 + t2; - x1 = s4 - t2; - x2 = s7 - t3; - x3 = s7 + t3; - - // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1] = (tran_low_t)fdct_round_shift(t0); - output[3] = (tran_low_t)fdct_round_shift(t2); - output[5] = (tran_low_t)fdct_round_shift(t1); - output[7] = (tran_low_t)fdct_round_shift(t3); - output += 8; - } - in = intermediate; - output = final_output; - } - - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; - } -} - -void av1_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 8; ++r) - for (c = 0; c < 8; ++c) sum += input[r * stride + c]; - - output[0] = sum; - output[1] = 0; -} - -void av1_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - tran_low_t intermediate[256]; - const tran_low_t *in_low = NULL; - tran_low_t *out = intermediate; - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - tran_high_t step1[8]; // canbe16 - tran_high_t step2[8]; // canbe16 - tran_high_t step3[8]; // canbe16 - tran_high_t in_high[8]; // canbe16 - tran_high_t temp1, temp2; // needs32 - int i; - for (i = 0; i < 16; i++) { - if (0 == pass) { - // Calculate input for the first 8 results. - in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; - in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; - in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; - in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; - in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; - in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; - in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; - in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; - // Calculate input for the next 8 results. - step1[0] = (input[7 * stride] - input[8 * stride]) * 4; - step1[1] = (input[6 * stride] - input[9 * stride]) * 4; - step1[2] = (input[5 * stride] - input[10 * stride]) * 4; - step1[3] = (input[4 * stride] - input[11 * stride]) * 4; - step1[4] = (input[3 * stride] - input[12 * stride]) * 4; - step1[5] = (input[2 * stride] - input[13 * stride]) * 4; - step1[6] = (input[1 * stride] - input[14 * stride]) * 4; - step1[7] = (input[0 * stride] - input[15 * stride]) * 4; - } else { - // Calculate input for the first 8 results. - assert(in_low != NULL); - in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); - in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); - in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); - in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); - in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); - in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); - in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); - in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); - // Calculate input for the next 8 results. - step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); - step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); - step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); - step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); - step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); - step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); - step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); - step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); - in_low++; - } - // Work on the first eight values; fdct8(input, even_results); - { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 - tran_high_t t0, t1, t2, t3; // needs32 - tran_high_t x0, x1, x2, x3; // canbe16 - - // stage 1 - s0 = in_high[0] + in_high[7]; - s1 = in_high[1] + in_high[6]; - s2 = in_high[2] + in_high[5]; - s3 = in_high[3] + in_high[4]; - s4 = in_high[3] - in_high[4]; - s5 = in_high[2] - in_high[5]; - s6 = in_high[1] - in_high[6]; - s7 = in_high[0] - in_high[7]; - - // fdct4(step, step); - x0 = s0 + s3; - x1 = s1 + s2; - x2 = s1 - s2; - x3 = s0 - s3; - t0 = (x0 + x1) * cospi_16_64; - t1 = (x0 - x1) * cospi_16_64; - t2 = x3 * cospi_8_64 + x2 * cospi_24_64; - t3 = x3 * cospi_24_64 - x2 * cospi_8_64; - out[0] = (tran_low_t)fdct_round_shift(t0); - out[4] = (tran_low_t)fdct_round_shift(t2); - out[8] = (tran_low_t)fdct_round_shift(t1); - out[12] = (tran_low_t)fdct_round_shift(t3); - - // Stage 2 - t0 = (s6 - s5) * cospi_16_64; - t1 = (s6 + s5) * cospi_16_64; - t2 = fdct_round_shift(t0); - t3 = fdct_round_shift(t1); - - // Stage 3 - x0 = s4 + t2; - x1 = s4 - t2; - x2 = s7 - t3; - x3 = s7 + t3; - - // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - out[2] = (tran_low_t)fdct_round_shift(t0); - out[6] = (tran_low_t)fdct_round_shift(t2); - out[10] = (tran_low_t)fdct_round_shift(t1); - out[14] = (tran_low_t)fdct_round_shift(t3); - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - temp1 = (step1[5] - step1[2]) * cospi_16_64; - temp2 = (step1[4] - step1[3]) * cospi_16_64; - step2[2] = fdct_round_shift(temp1); - step2[3] = fdct_round_shift(temp2); - temp1 = (step1[4] + step1[3]) * cospi_16_64; - temp2 = (step1[5] + step1[2]) * cospi_16_64; - step2[4] = fdct_round_shift(temp1); - step2[5] = fdct_round_shift(temp2); - // step 3 - step3[0] = step1[0] + step2[3]; - step3[1] = step1[1] + step2[2]; - step3[2] = step1[1] - step2[2]; - step3[3] = step1[0] - step2[3]; - step3[4] = step1[7] - step2[4]; - step3[5] = step1[6] - step2[5]; - step3[6] = step1[6] + step2[5]; - step3[7] = step1[7] + step2[4]; - // step 4 - temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; - step2[1] = fdct_round_shift(temp1); - step2[2] = fdct_round_shift(temp2); - temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; - temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; - step2[5] = fdct_round_shift(temp1); - step2[6] = fdct_round_shift(temp2); - // step 5 - step1[0] = step3[0] + step2[1]; - step1[1] = step3[0] - step2[1]; - step1[2] = step3[3] + step2[2]; - step1[3] = step3[3] - step2[2]; - step1[4] = step3[4] - step2[5]; - step1[5] = step3[4] + step2[5]; - step1[6] = step3[7] - step2[6]; - step1[7] = step3[7] + step2[6]; - // step 6 - temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; - temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; - out[1] = (tran_low_t)fdct_round_shift(temp1); - out[9] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; - out[5] = (tran_low_t)fdct_round_shift(temp1); - out[13] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; - temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; - out[3] = (tran_low_t)fdct_round_shift(temp1); - out[11] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; - out[7] = (tran_low_t)fdct_round_shift(temp1); - out[15] = (tran_low_t)fdct_round_shift(temp2); - } - // Do next column (which is a transposed row in second/horizontal pass) - input++; - out += 16; - } - // Setup in/out for next pass. - in_low = intermediate; - out = output; - } -} - -void av1_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 16; ++r) - for (c = 0; c < 16; ++c) sum += input[r * stride + c]; - - output[0] = sum >> 1; - output[1] = 0; -} - -static INLINE tran_high_t dct_32_round(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - // TODO(debargha, peter.derivaz): Find new bounds for this assert, - // and make the bounds consts. - // assert(-131072 <= rv && rv <= 131071); - return rv; -} - -static INLINE tran_high_t half_round_shift(tran_high_t input) { - tran_high_t rv = (input + 1 + (input < 0)) >> 2; - return rv; -} - -void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round) { - tran_high_t step[32]; - // Stage 1 - step[0] = input[0] + input[(32 - 1)]; - step[1] = input[1] + input[(32 - 2)]; - step[2] = input[2] + input[(32 - 3)]; - step[3] = input[3] + input[(32 - 4)]; - step[4] = input[4] + input[(32 - 5)]; - step[5] = input[5] + input[(32 - 6)]; - step[6] = input[6] + input[(32 - 7)]; - step[7] = input[7] + input[(32 - 8)]; - step[8] = input[8] + input[(32 - 9)]; - step[9] = input[9] + input[(32 - 10)]; - step[10] = input[10] + input[(32 - 11)]; - step[11] = input[11] + input[(32 - 12)]; - step[12] = input[12] + input[(32 - 13)]; - step[13] = input[13] + input[(32 - 14)]; - step[14] = input[14] + input[(32 - 15)]; - step[15] = input[15] + input[(32 - 16)]; - step[16] = -input[16] + input[(32 - 17)]; - step[17] = -input[17] + input[(32 - 18)]; - step[18] = -input[18] + input[(32 - 19)]; - step[19] = -input[19] + input[(32 - 20)]; - step[20] = -input[20] + input[(32 - 21)]; - step[21] = -input[21] + input[(32 - 22)]; - step[22] = -input[22] + input[(32 - 23)]; - step[23] = -input[23] + input[(32 - 24)]; - step[24] = -input[24] + input[(32 - 25)]; - step[25] = -input[25] + input[(32 - 26)]; - step[26] = -input[26] + input[(32 - 27)]; - step[27] = -input[27] + input[(32 - 28)]; - step[28] = -input[28] + input[(32 - 29)]; - step[29] = -input[29] + input[(32 - 30)]; - step[30] = -input[30] + input[(32 - 31)]; - step[31] = -input[31] + input[(32 - 32)]; - - // Stage 2 - output[0] = step[0] + step[16 - 1]; - output[1] = step[1] + step[16 - 2]; - output[2] = step[2] + step[16 - 3]; - output[3] = step[3] + step[16 - 4]; - output[4] = step[4] + step[16 - 5]; - output[5] = step[5] + step[16 - 6]; - output[6] = step[6] + step[16 - 7]; - output[7] = step[7] + step[16 - 8]; - output[8] = -step[8] + step[16 - 9]; - output[9] = -step[9] + step[16 - 10]; - output[10] = -step[10] + step[16 - 11]; - output[11] = -step[11] + step[16 - 12]; - output[12] = -step[12] + step[16 - 13]; - output[13] = -step[13] + step[16 - 14]; - output[14] = -step[14] + step[16 - 15]; - output[15] = -step[15] + step[16 - 16]; - - output[16] = step[16]; - output[17] = step[17]; - output[18] = step[18]; - output[19] = step[19]; - - output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); - output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); - output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); - output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); - - output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); - output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); - output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); - output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); - - output[28] = step[28]; - output[29] = step[29]; - output[30] = step[30]; - output[31] = step[31]; - - // dump the magnitude by 4, hence the intermediate values are within - // the range of 16 bits. - if (round) { - output[0] = half_round_shift(output[0]); - output[1] = half_round_shift(output[1]); - output[2] = half_round_shift(output[2]); - output[3] = half_round_shift(output[3]); - output[4] = half_round_shift(output[4]); - output[5] = half_round_shift(output[5]); - output[6] = half_round_shift(output[6]); - output[7] = half_round_shift(output[7]); - output[8] = half_round_shift(output[8]); - output[9] = half_round_shift(output[9]); - output[10] = half_round_shift(output[10]); - output[11] = half_round_shift(output[11]); - output[12] = half_round_shift(output[12]); - output[13] = half_round_shift(output[13]); - output[14] = half_round_shift(output[14]); - output[15] = half_round_shift(output[15]); - - output[16] = half_round_shift(output[16]); - output[17] = half_round_shift(output[17]); - output[18] = half_round_shift(output[18]); - output[19] = half_round_shift(output[19]); - output[20] = half_round_shift(output[20]); - output[21] = half_round_shift(output[21]); - output[22] = half_round_shift(output[22]); - output[23] = half_round_shift(output[23]); - output[24] = half_round_shift(output[24]); - output[25] = half_round_shift(output[25]); - output[26] = half_round_shift(output[26]); - output[27] = half_round_shift(output[27]); - output[28] = half_round_shift(output[28]); - output[29] = half_round_shift(output[29]); - output[30] = half_round_shift(output[30]); - output[31] = half_round_shift(output[31]); - } - - // Stage 3 - step[0] = output[0] + output[(8 - 1)]; - step[1] = output[1] + output[(8 - 2)]; - step[2] = output[2] + output[(8 - 3)]; - step[3] = output[3] + output[(8 - 4)]; - step[4] = -output[4] + output[(8 - 5)]; - step[5] = -output[5] + output[(8 - 6)]; - step[6] = -output[6] + output[(8 - 7)]; - step[7] = -output[7] + output[(8 - 8)]; - step[8] = output[8]; - step[9] = output[9]; - step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); - step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); - step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); - step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); - step[14] = output[14]; - step[15] = output[15]; - - step[16] = output[16] + output[23]; - step[17] = output[17] + output[22]; - step[18] = output[18] + output[21]; - step[19] = output[19] + output[20]; - step[20] = -output[20] + output[19]; - step[21] = -output[21] + output[18]; - step[22] = -output[22] + output[17]; - step[23] = -output[23] + output[16]; - step[24] = -output[24] + output[31]; - step[25] = -output[25] + output[30]; - step[26] = -output[26] + output[29]; - step[27] = -output[27] + output[28]; - step[28] = output[28] + output[27]; - step[29] = output[29] + output[26]; - step[30] = output[30] + output[25]; - step[31] = output[31] + output[24]; - - // Stage 4 - output[0] = step[0] + step[3]; - output[1] = step[1] + step[2]; - output[2] = -step[2] + step[1]; - output[3] = -step[3] + step[0]; - output[4] = step[4]; - output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); - output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); - output[7] = step[7]; - output[8] = step[8] + step[11]; - output[9] = step[9] + step[10]; - output[10] = -step[10] + step[9]; - output[11] = -step[11] + step[8]; - output[12] = -step[12] + step[15]; - output[13] = -step[13] + step[14]; - output[14] = step[14] + step[13]; - output[15] = step[15] + step[12]; - - output[16] = step[16]; - output[17] = step[17]; - output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); - output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); - output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); - output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); - output[22] = step[22]; - output[23] = step[23]; - output[24] = step[24]; - output[25] = step[25]; - output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); - output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); - output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); - output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); - output[30] = step[30]; - output[31] = step[31]; - - // Stage 5 - step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); - step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); - step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); - step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); - step[4] = output[4] + output[5]; - step[5] = -output[5] + output[4]; - step[6] = -output[6] + output[7]; - step[7] = output[7] + output[6]; - step[8] = output[8]; - step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); - step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); - step[11] = output[11]; - step[12] = output[12]; - step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); - step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); - step[15] = output[15]; - - step[16] = output[16] + output[19]; - step[17] = output[17] + output[18]; - step[18] = -output[18] + output[17]; - step[19] = -output[19] + output[16]; - step[20] = -output[20] + output[23]; - step[21] = -output[21] + output[22]; - step[22] = output[22] + output[21]; - step[23] = output[23] + output[20]; - step[24] = output[24] + output[27]; - step[25] = output[25] + output[26]; - step[26] = -output[26] + output[25]; - step[27] = -output[27] + output[24]; - step[28] = -output[28] + output[31]; - step[29] = -output[29] + output[30]; - step[30] = output[30] + output[29]; - step[31] = output[31] + output[28]; - - // Stage 6 - output[0] = step[0]; - output[1] = step[1]; - output[2] = step[2]; - output[3] = step[3]; - output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); - output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); - output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); - output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); - output[8] = step[8] + step[9]; - output[9] = -step[9] + step[8]; - output[10] = -step[10] + step[11]; - output[11] = step[11] + step[10]; - output[12] = step[12] + step[13]; - output[13] = -step[13] + step[12]; - output[14] = -step[14] + step[15]; - output[15] = step[15] + step[14]; - - output[16] = step[16]; - output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); - output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); - output[19] = step[19]; - output[20] = step[20]; - output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); - output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); - output[23] = step[23]; - output[24] = step[24]; - output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); - output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); - output[27] = step[27]; - output[28] = step[28]; - output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); - output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); - output[31] = step[31]; - - // Stage 7 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - step[4] = output[4]; - step[5] = output[5]; - step[6] = output[6]; - step[7] = output[7]; - step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); - step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); - step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); - step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); - step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); - step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); - step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); - step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); - - step[16] = output[16] + output[17]; - step[17] = -output[17] + output[16]; - step[18] = -output[18] + output[19]; - step[19] = output[19] + output[18]; - step[20] = output[20] + output[21]; - step[21] = -output[21] + output[20]; - step[22] = -output[22] + output[23]; - step[23] = output[23] + output[22]; - step[24] = output[24] + output[25]; - step[25] = -output[25] + output[24]; - step[26] = -output[26] + output[27]; - step[27] = output[27] + output[26]; - step[28] = output[28] + output[29]; - step[29] = -output[29] + output[28]; - step[30] = -output[30] + output[31]; - step[31] = output[31] + output[30]; - - // Final stage --- outputs indices are bit-reversed. - output[0] = step[0]; - output[16] = step[1]; - output[8] = step[2]; - output[24] = step[3]; - output[4] = step[4]; - output[20] = step[5]; - output[12] = step[6]; - output[28] = step[7]; - output[2] = step[8]; - output[18] = step[9]; - output[10] = step[10]; - output[26] = step[11]; - output[6] = step[12]; - output[22] = step[13]; - output[14] = step[14]; - output[30] = step[15]; - - output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); - output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); - output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); - output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); - output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); - output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); - output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); - output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); - output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); - output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); - output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); - output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); - output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); - output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); - output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); - output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); -} - -void av1_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - int i, j; - tran_high_t output[32 * 32]; - - // Columns - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; - av1_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } - - // Rows - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - av1_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } -} - -// Note that although we use dct_32_round in dct32 computation flow, -// this 2d fdct32x32 for rate-distortion optimization loop is operating -// within 16 bits precision. -void av1_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { - int i, j; - tran_high_t output[32 * 32]; - - // Columns - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; - av1_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - // TODO(cd): see quality impact of only doing - // output[j * 32 + i] = (temp_out[j] + 1) >> 2; - // PS: also change code in av1_dsp/x86/av1_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } - - // Rows - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - av1_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; - } -} - -void av1_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 32; ++r) - for (c = 0; c < 32; ++c) sum += input[r * stride + c]; - - output[0] = sum >> 3; - output[1] = 0; -} - -#if CONFIG_AOM_HIGHBITDEPTH -void av1_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, - int stride) { - av1_fdct4x4_c(input, output, stride); -} - -void av1_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, - int stride) { - av1_fdct8x8_c(input, final_output, stride); -} - -void av1_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, - int stride) { - av1_fdct8x8_1_c(input, final_output, stride); -} - -void av1_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, - int stride) { - av1_fdct16x16_c(input, output, stride); -} - -void av1_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, - int stride) { - av1_fdct16x16_1_c(input, output, stride); -} - -void av1_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - av1_fdct32x32_c(input, out, stride); -} - -void av1_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, - int stride) { - av1_fdct32x32_rd_c(input, out, stride); -} - -void av1_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, - int stride) { - av1_fdct32x32_1_c(input, out, stride); -} -#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_fwd_txfm.h b/av1/common/av1_fwd_txfm.h deleted file mode 100644 index db763e5..0000000 --- a/av1/common/av1_fwd_txfm.h +++ /dev/null
@@ -1,19 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_COMMON_AV1_FWD_TXFM_H_ -#define AV1_COMMON_AV1_FWD_TXFM_H_ - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/fwd_txfm.h" - -void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round); -#endif // AV1_COMMON_AV1_FWD_TXFM_H_
diff --git a/av1/common/av1_inv_txfm.c b/av1/common/av1_inv_txfm.c deleted file mode 100644 index 4b2f061..0000000 --- a/av1/common/av1_inv_txfm.c +++ /dev/null
@@ -1,2468 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <math.h> -#include <string.h> - -#include "./av1_rtcd.h" -#include "av1/common/av1_inv_txfm.h" - -void av1_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = WRAPLOW(a1); - op[1] = WRAPLOW(b1); - op[2] = WRAPLOW(c1); - op[3] = WRAPLOW(d1); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); - dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); - dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); - dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); - - ip++; - dest++; - } -} - -void av1_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = WRAPLOW(a1); - op[1] = op[2] = op[3] = WRAPLOW(e1); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); - ip++; - dest++; - } -} - -void av1_idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1)); - step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1)); - step[3] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3]); - output[1] = WRAPLOW(step[1] + step[2]); - output[2] = WRAPLOW(step[1] - step[2]); - output[3] = WRAPLOW(step[0] - step[3]); -} - -void av1_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - - // Rows - for (i = 0; i < 4; ++i) { - av1_idct4_c(input, outptr); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - av1_idct4_c(temp_in, temp_out); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 4)); - } - } -} - -void av1_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_add(dest[0], a1); - dest[1] = clip_pixel_add(dest[1], a1); - dest[2] = clip_pixel_add(dest[2], a1); - dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; - } -} - -void av1_idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - temp1 = (step1[0] + step1[2]) * cospi_16_64; - temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); -} - -void av1_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - av1_idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - av1_idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void av1_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void av1_iadst4_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = WRAPLOW(x0 - x2 + x3); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); - output[2] = WRAPLOW(dct_const_round_shift(s2)); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); -} - -void av1_iadst8_c(const tran_low_t *input, tran_low_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - - tran_high_t x0 = input[7]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[5]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[3]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[1]; - tran_high_t x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; - return; - } - - // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); - s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); - s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); - s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); - s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); - - x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); - - // stage 2 - s0 = (int)x0; - s1 = (int)x1; - s2 = (int)x2; - s3 = (int)x3; - s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); - s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); - s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); - s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - - // stage 3 - s2 = (int)(cospi_16_64 * (x2 + x3)); - s3 = (int)(cospi_16_64 * (x2 - x3)); - s6 = (int)(cospi_16_64 * (x6 + x7)); - s7 = (int)(cospi_16_64 * (x6 - x7)); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x4); - output[2] = WRAPLOW(x6); - output[3] = WRAPLOW(-x2); - output[4] = WRAPLOW(x3); - output[5] = WRAPLOW(-x7); - output[6] = WRAPLOW(x5); - output[7] = WRAPLOW(-x1); -} - -void av1_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - // only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) { - av1_idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - av1_idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void av1_idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15]); - output[1] = WRAPLOW(step2[1] + step2[14]); - output[2] = WRAPLOW(step2[2] + step2[13]); - output[3] = WRAPLOW(step2[3] + step2[12]); - output[4] = WRAPLOW(step2[4] + step2[11]); - output[5] = WRAPLOW(step2[5] + step2[10]); - output[6] = WRAPLOW(step2[6] + step2[9]); - output[7] = WRAPLOW(step2[7] + step2[8]); - output[8] = WRAPLOW(step2[7] - step2[8]); - output[9] = WRAPLOW(step2[6] - step2[9]); - output[10] = WRAPLOW(step2[5] - step2[10]); - output[11] = WRAPLOW(step2[4] - step2[11]); - output[12] = WRAPLOW(step2[3] - step2[12]); - output[13] = WRAPLOW(step2[2] - step2[13]); - output[14] = WRAPLOW(step2[1] - step2[14]); - output[15] = WRAPLOW(step2[0] - step2[15]); -} - -void av1_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows - for (i = 0; i < 16; ++i) { - av1_idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - av1_idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void av1_iadst16_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); -} - -void av1_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - av1_idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - av1_idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void av1_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 6); - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void av1_idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1)); - step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - step2[16] = WRAPLOW(step1[16] + step1[17]); - step2[17] = WRAPLOW(step1[16] - step1[17]); - step2[18] = WRAPLOW(-step1[18] + step1[19]); - step2[19] = WRAPLOW(step1[18] + step1[19]); - step2[20] = WRAPLOW(step1[20] + step1[21]); - step2[21] = WRAPLOW(step1[20] - step1[21]); - step2[22] = WRAPLOW(-step1[22] + step1[23]); - step2[23] = WRAPLOW(step1[22] + step1[23]); - step2[24] = WRAPLOW(step1[24] + step1[25]); - step2[25] = WRAPLOW(step1[24] - step1[25]); - step2[26] = WRAPLOW(-step1[26] + step1[27]); - step2[27] = WRAPLOW(step1[26] + step1[27]); - step2[28] = WRAPLOW(step1[28] + step1[29]); - step2[29] = WRAPLOW(step1[28] - step1[29]); - step2[30] = WRAPLOW(-step1[30] + step1[31]); - step2[31] = WRAPLOW(step1[30] + step1[31]); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = WRAPLOW(step1[16] + step1[19]); - step2[17] = WRAPLOW(step1[17] + step1[18]); - step2[18] = WRAPLOW(step1[17] - step1[18]); - step2[19] = WRAPLOW(step1[16] - step1[19]); - step2[20] = WRAPLOW(-step1[20] + step1[23]); - step2[21] = WRAPLOW(-step1[21] + step1[22]); - step2[22] = WRAPLOW(step1[21] + step1[22]); - step2[23] = WRAPLOW(step1[20] + step1[23]); - - step2[24] = WRAPLOW(step1[24] + step1[27]); - step2[25] = WRAPLOW(step1[25] + step1[26]); - step2[26] = WRAPLOW(step1[25] - step1[26]); - step2[27] = WRAPLOW(step1[24] - step1[27]); - step2[28] = WRAPLOW(-step1[28] + step1[31]); - step2[29] = WRAPLOW(-step1[29] + step1[30]); - step2[30] = WRAPLOW(step1[29] + step1[30]); - step2[31] = WRAPLOW(step1[28] + step1[31]); - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = WRAPLOW(step1[16] + step1[23]); - step2[17] = WRAPLOW(step1[17] + step1[22]); - step2[18] = WRAPLOW(step1[18] + step1[21]); - step2[19] = WRAPLOW(step1[19] + step1[20]); - step2[20] = WRAPLOW(step1[19] - step1[20]); - step2[21] = WRAPLOW(step1[18] - step1[21]); - step2[22] = WRAPLOW(step1[17] - step1[22]); - step2[23] = WRAPLOW(step1[16] - step1[23]); - - step2[24] = WRAPLOW(-step1[24] + step1[31]); - step2[25] = WRAPLOW(-step1[25] + step1[30]); - step2[26] = WRAPLOW(-step1[26] + step1[29]); - step2[27] = WRAPLOW(-step1[27] + step1[28]); - step2[28] = WRAPLOW(step1[27] + step1[28]); - step2[29] = WRAPLOW(step1[26] + step1[29]); - step2[30] = WRAPLOW(step1[25] + step1[30]); - step2[31] = WRAPLOW(step1[24] + step1[31]); - - // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15]); - step1[1] = WRAPLOW(step2[1] + step2[14]); - step1[2] = WRAPLOW(step2[2] + step2[13]); - step1[3] = WRAPLOW(step2[3] + step2[12]); - step1[4] = WRAPLOW(step2[4] + step2[11]); - step1[5] = WRAPLOW(step2[5] + step2[10]); - step1[6] = WRAPLOW(step2[6] + step2[9]); - step1[7] = WRAPLOW(step2[7] + step2[8]); - step1[8] = WRAPLOW(step2[7] - step2[8]); - step1[9] = WRAPLOW(step2[6] - step2[9]); - step1[10] = WRAPLOW(step2[5] - step2[10]); - step1[11] = WRAPLOW(step2[4] - step2[11]); - step1[12] = WRAPLOW(step2[3] - step2[12]); - step1[13] = WRAPLOW(step2[2] - step2[13]); - step1[14] = WRAPLOW(step2[1] - step2[14]); - step1[15] = WRAPLOW(step2[0] - step2[15]); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = WRAPLOW(step1[0] + step1[31]); - output[1] = WRAPLOW(step1[1] + step1[30]); - output[2] = WRAPLOW(step1[2] + step1[29]); - output[3] = WRAPLOW(step1[3] + step1[28]); - output[4] = WRAPLOW(step1[4] + step1[27]); - output[5] = WRAPLOW(step1[5] + step1[26]); - output[6] = WRAPLOW(step1[6] + step1[25]); - output[7] = WRAPLOW(step1[7] + step1[24]); - output[8] = WRAPLOW(step1[8] + step1[23]); - output[9] = WRAPLOW(step1[9] + step1[22]); - output[10] = WRAPLOW(step1[10] + step1[21]); - output[11] = WRAPLOW(step1[11] + step1[20]); - output[12] = WRAPLOW(step1[12] + step1[19]); - output[13] = WRAPLOW(step1[13] + step1[18]); - output[14] = WRAPLOW(step1[14] + step1[17]); - output[15] = WRAPLOW(step1[15] + step1[16]); - output[16] = WRAPLOW(step1[15] - step1[16]); - output[17] = WRAPLOW(step1[14] - step1[17]); - output[18] = WRAPLOW(step1[13] - step1[18]); - output[19] = WRAPLOW(step1[12] - step1[19]); - output[20] = WRAPLOW(step1[11] - step1[20]); - output[21] = WRAPLOW(step1[10] - step1[21]); - output[22] = WRAPLOW(step1[9] - step1[22]); - output[23] = WRAPLOW(step1[8] - step1[23]); - output[24] = WRAPLOW(step1[7] - step1[24]); - output[25] = WRAPLOW(step1[6] - step1[25]); - output[26] = WRAPLOW(step1[5] - step1[26]); - output[27] = WRAPLOW(step1[4] - step1[27]); - output[28] = WRAPLOW(step1[3] - step1[28]); - output[29] = WRAPLOW(step1[2] - step1[29]); - output[30] = WRAPLOW(step1[1] - step1[30]); - output[31] = WRAPLOW(step1[0] - step1[31]); -} - -void av1_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - av1_idct32_c(input, outptr); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - av1_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void av1_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) { - av1_idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - av1_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void av1_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 6); - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -#if CONFIG_AOM_HIGHBITDEPTH -void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = HIGHBD_WRAPLOW(a1, bd); - op[1] = HIGHBD_WRAPLOW(b1, bd); - op[2] = HIGHBD_WRAPLOW(c1, bd); - op[3] = HIGHBD_WRAPLOW(d1, bd); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = - highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); - dest[stride * 1] = - highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); - dest[stride * 2] = - highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); - dest[stride * 3] = - highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); - ip++; - dest++; - } -} - -void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - (void)bd; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = HIGHBD_WRAPLOW(a1, bd); - op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = - highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = - highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = - highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = - highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); - ip++; - dest++; - } -} - -void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void)bd; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 2 - output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); - output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); - output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); - output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); -} - -void av1_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 4; ++i) { - av1_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - av1_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } -} - -void av1_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); - dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); - dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); - dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; - } -} - -void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 2 & stage 3 - even half - av1_highbd_idct4_c(step1, step1, bd); - - // stage 2 - odd half - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - // stage 3 - odd half - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - // stage 4 - output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); -} - -void av1_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - for (i = 0; i < 8; ++i) { - av1_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - - // Then transform columns. - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - av1_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void av1_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - -void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - (void)bd; - - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); -} - -void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[7]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; - tran_low_t x7 = input[6]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x4, bd); - output[2] = HIGHBD_WRAPLOW(x6, bd); - output[3] = HIGHBD_WRAPLOW(-x2, bd); - output[4] = HIGHBD_WRAPLOW(x3, bd); - output[5] = HIGHBD_WRAPLOW(-x7, bd); - output[6] = HIGHBD_WRAPLOW(x5, bd); - output[7] = HIGHBD_WRAPLOW(-x1, bd); -} - -void av1_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - // Only first 4 row has non-zero coefs. - for (i = 0; i < 4; ++i) { - av1_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - // Then transform columns. - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - av1_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - (void)bd; - - // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); - step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); - step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); - step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); - step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); - step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); - step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); - - // stage 6 - step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); - output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); - output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); - output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); - output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); - output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); - output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); - output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); - output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); - output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); - output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); - output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); - output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); - output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); - output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); - output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); -} - -void av1_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - for (i = 0; i < 16; ++i) { - av1_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - - // Then transform columns. - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - av1_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); -} - -void av1_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - av1_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - - // Then transform columns. - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - av1_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void av1_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 6); - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - -static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, - int bd) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - (void)bd; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); - step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); - step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); - step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); - step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); - step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); - step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); - step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); - step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); - step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); - step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); - step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); - step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); - step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); - step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); - step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); - step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); - step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); - step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); - step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); - step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); - step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); - step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); - step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); - step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); - - step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); - step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); - step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); - step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); - step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); - step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); - step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); - step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); - - // stage 5 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); - step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); - step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); - step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); - step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); - step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); - step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); - step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); - step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); - step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); - step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); - step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); - - step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); - step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); - step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); - step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); - step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); - step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); - step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); - step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); - - // stage 7 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); - step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); - step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); - step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); - step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); - step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); - step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); - step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); - step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); - step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); - output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); - output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); - output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); - output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); - output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); - output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); - output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); - output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); - output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); - output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); - output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); - output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); - output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); - output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); - output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); - output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); - output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); - output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); - output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); - output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); - output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); - output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); - output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); - output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); - output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); - output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); - output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); - output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); -} - -void av1_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - highbd_idct32_c(input, outptr, bd); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - highbd_idct32_c(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void av1_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[32 * 32] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - // Only upper-left 8x8 has non-zero coeff. - for (i = 0; i < 8; ++i) { - highbd_idct32_c(input, outptr, bd); - input += 32; - outptr += 32; - } - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - highbd_idct32_c(temp_in, temp_out, bd); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } -} - -void av1_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 6); - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} -#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_inv_txfm.h b/av1/common/av1_inv_txfm.h deleted file mode 100644 index c57e888..0000000 --- a/av1/common/av1_inv_txfm.h +++ /dev/null
@@ -1,133 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_INV_TXFM_H_ -#define AOM_DSP_INV_TXFM_H_ - -#include <assert.h> - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -static INLINE tran_high_t check_range(tran_high_t input) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid input streams, intermediate stage coefficients should always - // stay within the range of a signed 16 bit integer. Coefficients can go out - // of this range for invalid/corrupt streams. However, strictly checking - // this range for every intermediate coefficient can burdensome for a decoder, - // therefore the following assertion is only enabled when configured with - // --enable-coefficient-range-checking. - assert(INT16_MIN <= input); - assert(input <= INT16_MAX); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return input; -} - -static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return rv; -} - -#if CONFIG_AOM_HIGHBITDEPTH -static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid highbitdepth streams, intermediate stage coefficients will - // stay within the ranges: - // - 8 bit: signed 16 bit integer - // - 10 bit: signed 18 bit integer - // - 12 bit: signed 20 bit integer - const int32_t int_max = (1 << (7 + bd)) - 1; - const int32_t int_min = -int_max - 1; - assert(int_min <= input); - assert(input <= int_max); - (void)int_min; -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - (void)bd; - return input; -} - -static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return rv; -} -#endif // CONFIG_AOM_HIGHBITDEPTH - -#if CONFIG_EMULATE_HARDWARE -// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a -// non-normative method to handle overflows. A stream that causes -// overflows in the inverse transform is considered invalid, -// and a hardware implementer is free to choose any reasonable -// method to handle overflows. However to aid in hardware -// verification they can use a specific implementation of the -// WRAPLOW() macro below that is identical to their intended -// hardware implementation (and also use configure options to trigger -// the C-implementation of the transform). -// -// The particular WRAPLOW implementation below performs strict -// overflow wrapping to match common hardware implementations. -// bd of 8 uses trans_low with 16bits, need to remove 16bits -// bd of 10 uses trans_low with 18bits, need to remove 14bits -// bd of 12 uses trans_low with 20bits, need to remove 12bits -// bd of x uses trans_low with 8+x bits, need to remove 24-x bits - -#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) -#if CONFIG_AOM_HIGHBITDEPTH -#define HIGHBD_WRAPLOW(x, bd) \ - ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) -#endif // CONFIG_AOM_HIGHBITDEPTH - -#else // CONFIG_EMULATE_HARDWARE - -#define WRAPLOW(x) ((int32_t)check_range(x)) -#if CONFIG_AOM_HIGHBITDEPTH -#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd)) -#endif // CONFIG_AOM_HIGHBITDEPTH - -#endif // CONFIG_EMULATE_HARDWARE - -void av1_idct4_c(const tran_low_t *input, tran_low_t *output); -void av1_idct8_c(const tran_low_t *input, tran_low_t *output); -void av1_idct16_c(const tran_low_t *input, tran_low_t *output); -void av1_idct32_c(const tran_low_t *input, tran_low_t *output); -void av1_iadst4_c(const tran_low_t *input, tran_low_t *output); -void av1_iadst8_c(const tran_low_t *input, tran_low_t *output); -void av1_iadst16_c(const tran_low_t *input, tran_low_t *output); - -#if CONFIG_AOM_HIGHBITDEPTH -void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); -void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); -void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); - -void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); -void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); -void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); - -static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, - int bd) { - trans = HIGHBD_WRAPLOW(trans, bd); - return clip_pixel_highbd(dest + (int)trans, bd); -} -#endif - -static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { - trans = WRAPLOW(trans); - return clip_pixel(dest + (int)trans); -} -#ifdef __cplusplus -} // extern "C" -#endif -#endif // AOM_DSP_INV_TXFM_H_
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index e66826f..1181b66 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -414,62 +414,6 @@ specialize qw/av1_fht32x16 sse2/; } -if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4/; - - add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4_1/; - - add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8/; - - add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8_1/; - - add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16/; - - add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16_1/; - - add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32/; - - add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_rd/; - - add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_1/; -} else { - add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4 sse2/; - - add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4_1 sse2/; - - add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8 sse2/; - - add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8_1 sse2/; - - add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16 sse2/; - - add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16_1 sse2/; - - add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32 sse2/; - - add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_rd sse2/; - - add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_1 sse2/; -} - if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") { if (aom_config("CONFIG_EXT_TX") ne "yes") { specialize qw/av1_fht4x4 msa/; @@ -478,243 +422,9 @@ } } -if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { - if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct4x4/; - - add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct8x8/; - - add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct8x8_1/; - - add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct16x16/; - - add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct16x16_1/; - - add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct32x32/; - - add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct32x32_rd/; - - add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct32x32_1/; - } else { - add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct4x4 sse2/; - - add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct8x8 sse2/; - - add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct8x8_1/; - - add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct16x16 sse2/; - - add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct16x16_1/; - - add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct32x32 sse2/; - - add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct32x32_rd sse2/; - - add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_highbd_fdct32x32_1/; - } -} - add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type"; specialize qw/av1_fwd_idtx/; -# Inverse transform -if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { - # Note as optimized versions of these functions are added we need to add a check to ensure - # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct4x4_1_add/; - - add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct4x4_16_add/; - - add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_1_add/; - - add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_64_add/; - - add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_12_add/; - - add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_1_add/; - - add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_256_add/; - - add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_10_add/; - - add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_1024_add/; - - add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_34_add/; - - add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_1_add/; - - add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_iwht4x4_1_add/; - - add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_iwht4x4_16_add/; - - add_proto qw/void av1_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct4x4_1_add/; - - add_proto qw/void av1_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct8x8_1_add/; - - add_proto qw/void av1_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct16x16_1_add/; - - add_proto qw/void av1_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct32x32_1024_add/; - - add_proto qw/void av1_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct32x32_34_add/; - - add_proto qw/void av1_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct32x32_1_add/; - - add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_iwht4x4_1_add/; - - add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_iwht4x4_16_add/; - - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct4x4_16_add/; - - add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct8x8_64_add/; - - add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct8x8_10_add/; - - add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct16x16_256_add/; - - add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct16x16_10_add/; - } else { - add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct4x4_16_add sse2/; - - add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct8x8_64_add sse2/; - - add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct8x8_10_add sse2/; - - add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct16x16_256_add sse2/; - - add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/av1_highbd_idct16x16_10_add sse2/; - } # CONFIG_EMULATE_HARDWARE -} else { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct4x4_1_add/; - - add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct4x4_16_add/; - - add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_1_add/; - - add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_64_add/; - - add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_12_add/; - - add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_1_add/; - - add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_256_add/; - - add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_10_add/; - - add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_1024_add/; - - add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_34_add/; - - add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_1_add/; - - add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_iwht4x4_1_add/; - - add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_iwht4x4_16_add/; - } else { - add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct4x4_1_add sse2/; - - add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct4x4_16_add sse2/; - - add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_1_add sse2/; - - add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_64_add sse2/; - - add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct8x8_12_add sse2/; - - add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_1_add sse2/; - - add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_256_add sse2/; - - add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct16x16_10_add sse2/; - - add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_1024_add sse2/; - - add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_34_add sse2/; - - add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_idct32x32_1_add sse2/; - - add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_iwht4x4_1_add/; - - add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/av1_iwht4x4_16_add/; - } # CONFIG_EMULATE_HARDWARE -} # CONFIG_AOM_HIGHBITDEPTH - if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { #fwd txfm add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
diff --git a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h b/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h deleted file mode 100644 index 876e579..0000000 --- a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h +++ /dev/null
@@ -1,3202 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "./av1_rtcd.h" -#include "av1/common/av1_fwd_txfm.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// TODO(jingning) The high bit-depth version needs re-work for performance. -// The current SSE2 implementation also causes cross reference to the static -// functions in the C implementation file. -#if DCT_HIGH_BIT_DEPTH -#define ADD_EPI16 _mm_adds_epi16 -#define SUB_EPI16 _mm_subs_epi16 -#if FDCT32x32_HIGH_PRECISION -void av1_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; - av1_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } -} -#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_c -#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rows_c -#else -void av1_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; - av1_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; - } -} -#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_rd_c -#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rd_rows_c -#endif // FDCT32x32_HIGH_PRECISION -#else -#define ADD_EPI16 _mm_add_epi16 -#define SUB_EPI16 _mm_sub_epi16 -#endif // DCT_HIGH_BIT_DEPTH - -void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { - // Calculate pre-multiplied strides - const int str1 = stride; - const int str2 = 2 * stride; - const int str3 = 2 * stride + str1; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - int pass; -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 32; column_start += 8) { - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - // Note: even though all the loads below are aligned, using the aligned - // intrinsic make the code slightly slower. - if (0 == pass) { - const int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); - step1[0] = ADD_EPI16(in00, in31); - step1[1] = ADD_EPI16(in01, in30); - step1[2] = ADD_EPI16(in02, in29); - step1[3] = ADD_EPI16(in03, in28); - step1[28] = SUB_EPI16(in03, in28); - step1[29] = SUB_EPI16(in02, in29); - step1[30] = SUB_EPI16(in01, in30); - step1[31] = SUB_EPI16(in00, in31); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2], - &step1[3], &step1[28], &step1[29], - &step1[30], &step1[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); - step1[4] = ADD_EPI16(in04, in27); - step1[5] = ADD_EPI16(in05, in26); - step1[6] = ADD_EPI16(in06, in25); - step1[7] = ADD_EPI16(in07, in24); - step1[24] = SUB_EPI16(in07, in24); - step1[25] = SUB_EPI16(in06, in25); - step1[26] = SUB_EPI16(in05, in26); - step1[27] = SUB_EPI16(in04, in27); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6], - &step1[7], &step1[24], &step1[25], - &step1[26], &step1[27]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); - step1[8] = ADD_EPI16(in08, in23); - step1[9] = ADD_EPI16(in09, in22); - step1[10] = ADD_EPI16(in10, in21); - step1[11] = ADD_EPI16(in11, in20); - step1[20] = SUB_EPI16(in11, in20); - step1[21] = SUB_EPI16(in10, in21); - step1[22] = SUB_EPI16(in09, in22); - step1[23] = SUB_EPI16(in08, in23); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10], - &step1[11], &step1[20], &step1[21], - &step1[22], &step1[23]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); - step1[12] = ADD_EPI16(in12, in19); - step1[13] = ADD_EPI16(in13, in18); - step1[14] = ADD_EPI16(in14, in17); - step1[15] = ADD_EPI16(in15, in16); - step1[16] = SUB_EPI16(in15, in16); - step1[17] = SUB_EPI16(in14, in17); - step1[18] = SUB_EPI16(in13, in18); - step1[19] = SUB_EPI16(in12, in19); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14], - &step1[15], &step1[16], &step1[17], - &step1[18], &step1[19]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Stage 2 - { - step2[0] = ADD_EPI16(step1[0], step1[15]); - step2[1] = ADD_EPI16(step1[1], step1[14]); - step2[2] = ADD_EPI16(step1[2], step1[13]); - step2[3] = ADD_EPI16(step1[3], step1[12]); - step2[4] = ADD_EPI16(step1[4], step1[11]); - step2[5] = ADD_EPI16(step1[5], step1[10]); - step2[6] = ADD_EPI16(step1[6], step1[9]); - step2[7] = ADD_EPI16(step1[7], step1[8]); - step2[8] = SUB_EPI16(step1[7], step1[8]); - step2[9] = SUB_EPI16(step1[6], step1[9]); - step2[10] = SUB_EPI16(step1[5], step1[10]); - step2[11] = SUB_EPI16(step1[4], step1[11]); - step2[12] = SUB_EPI16(step1[3], step1[12]); - step2[13] = SUB_EPI16(step1[2], step1[13]); - step2[14] = SUB_EPI16(step1[1], step1[14]); - step2[15] = SUB_EPI16(step1[0], step1[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], - &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22], - &step2[23], &step2[24], &step2[25], - &step2[26], &step2[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - -#if !FDCT32x32_HIGH_PRECISION - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); - __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); - __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); - __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); - __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); - __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); - __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); - __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); - __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); - __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); - __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); - __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); - __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); - __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); - __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); - __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); - __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); - __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); - __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); - __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); - __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); - __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); - __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); - - step2[0] = SUB_EPI16(step2[0], s3_00_0); - step2[1] = SUB_EPI16(step2[1], s3_01_0); - step2[2] = SUB_EPI16(step2[2], s3_02_0); - step2[3] = SUB_EPI16(step2[3], s3_03_0); - step2[4] = SUB_EPI16(step2[4], s3_04_0); - step2[5] = SUB_EPI16(step2[5], s3_05_0); - step2[6] = SUB_EPI16(step2[6], s3_06_0); - step2[7] = SUB_EPI16(step2[7], s3_07_0); - step2[8] = SUB_EPI16(step2[8], s2_08_0); - step2[9] = SUB_EPI16(step2[9], s2_09_0); - step2[10] = SUB_EPI16(step2[10], s3_10_0); - step2[11] = SUB_EPI16(step2[11], s3_11_0); - step2[12] = SUB_EPI16(step2[12], s3_12_0); - step2[13] = SUB_EPI16(step2[13], s3_13_0); - step2[14] = SUB_EPI16(step2[14], s2_14_0); - step2[15] = SUB_EPI16(step2[15], s2_15_0); - step1[16] = SUB_EPI16(step1[16], s3_16_0); - step1[17] = SUB_EPI16(step1[17], s3_17_0); - step1[18] = SUB_EPI16(step1[18], s3_18_0); - step1[19] = SUB_EPI16(step1[19], s3_19_0); - step2[20] = SUB_EPI16(step2[20], s3_20_0); - step2[21] = SUB_EPI16(step2[21], s3_21_0); - step2[22] = SUB_EPI16(step2[22], s3_22_0); - step2[23] = SUB_EPI16(step2[23], s3_23_0); - step2[24] = SUB_EPI16(step2[24], s3_24_0); - step2[25] = SUB_EPI16(step2[25], s3_25_0); - step2[26] = SUB_EPI16(step2[26], s3_26_0); - step2[27] = SUB_EPI16(step2[27], s3_27_0); - step1[28] = SUB_EPI16(step1[28], s3_28_0); - step1[29] = SUB_EPI16(step1[29], s3_29_0); - step1[30] = SUB_EPI16(step1[30], s3_30_0); - step1[31] = SUB_EPI16(step1[31], s3_31_0); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x32( - &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], - &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], - &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], - &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], - &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - step2[0] = _mm_add_epi16(step2[0], kOne); - step2[1] = _mm_add_epi16(step2[1], kOne); - step2[2] = _mm_add_epi16(step2[2], kOne); - step2[3] = _mm_add_epi16(step2[3], kOne); - step2[4] = _mm_add_epi16(step2[4], kOne); - step2[5] = _mm_add_epi16(step2[5], kOne); - step2[6] = _mm_add_epi16(step2[6], kOne); - step2[7] = _mm_add_epi16(step2[7], kOne); - step2[8] = _mm_add_epi16(step2[8], kOne); - step2[9] = _mm_add_epi16(step2[9], kOne); - step2[10] = _mm_add_epi16(step2[10], kOne); - step2[11] = _mm_add_epi16(step2[11], kOne); - step2[12] = _mm_add_epi16(step2[12], kOne); - step2[13] = _mm_add_epi16(step2[13], kOne); - step2[14] = _mm_add_epi16(step2[14], kOne); - step2[15] = _mm_add_epi16(step2[15], kOne); - step1[16] = _mm_add_epi16(step1[16], kOne); - step1[17] = _mm_add_epi16(step1[17], kOne); - step1[18] = _mm_add_epi16(step1[18], kOne); - step1[19] = _mm_add_epi16(step1[19], kOne); - step2[20] = _mm_add_epi16(step2[20], kOne); - step2[21] = _mm_add_epi16(step2[21], kOne); - step2[22] = _mm_add_epi16(step2[22], kOne); - step2[23] = _mm_add_epi16(step2[23], kOne); - step2[24] = _mm_add_epi16(step2[24], kOne); - step2[25] = _mm_add_epi16(step2[25], kOne); - step2[26] = _mm_add_epi16(step2[26], kOne); - step2[27] = _mm_add_epi16(step2[27], kOne); - step1[28] = _mm_add_epi16(step1[28], kOne); - step1[29] = _mm_add_epi16(step1[29], kOne); - step1[30] = _mm_add_epi16(step1[30], kOne); - step1[31] = _mm_add_epi16(step1[31], kOne); - - step2[0] = _mm_srai_epi16(step2[0], 2); - step2[1] = _mm_srai_epi16(step2[1], 2); - step2[2] = _mm_srai_epi16(step2[2], 2); - step2[3] = _mm_srai_epi16(step2[3], 2); - step2[4] = _mm_srai_epi16(step2[4], 2); - step2[5] = _mm_srai_epi16(step2[5], 2); - step2[6] = _mm_srai_epi16(step2[6], 2); - step2[7] = _mm_srai_epi16(step2[7], 2); - step2[8] = _mm_srai_epi16(step2[8], 2); - step2[9] = _mm_srai_epi16(step2[9], 2); - step2[10] = _mm_srai_epi16(step2[10], 2); - step2[11] = _mm_srai_epi16(step2[11], 2); - step2[12] = _mm_srai_epi16(step2[12], 2); - step2[13] = _mm_srai_epi16(step2[13], 2); - step2[14] = _mm_srai_epi16(step2[14], 2); - step2[15] = _mm_srai_epi16(step2[15], 2); - step1[16] = _mm_srai_epi16(step1[16], 2); - step1[17] = _mm_srai_epi16(step1[17], 2); - step1[18] = _mm_srai_epi16(step1[18], 2); - step1[19] = _mm_srai_epi16(step1[19], 2); - step2[20] = _mm_srai_epi16(step2[20], 2); - step2[21] = _mm_srai_epi16(step2[21], 2); - step2[22] = _mm_srai_epi16(step2[22], 2); - step2[23] = _mm_srai_epi16(step2[23], 2); - step2[24] = _mm_srai_epi16(step2[24], 2); - step2[25] = _mm_srai_epi16(step2[25], 2); - step2[26] = _mm_srai_epi16(step2[26], 2); - step2[27] = _mm_srai_epi16(step2[27], 2); - step1[28] = _mm_srai_epi16(step1[28], 2); - step1[29] = _mm_srai_epi16(step1[29], 2); - step1[30] = _mm_srai_epi16(step1[30], 2); - step1[31] = _mm_srai_epi16(step1[31], 2); - } -#endif // !FDCT32x32_HIGH_PRECISION - -#if FDCT32x32_HIGH_PRECISION - if (pass == 0) { -#endif - // Stage 3 - { - step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); - step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); - step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); - step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); - step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); - step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); - step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); - step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], - &step3[3], &step3[4], &step3[5], - &step3[6], &step3[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], - &step3[13]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[16] = ADD_EPI16(step2[23], step1[16]); - step3[17] = ADD_EPI16(step2[22], step1[17]); - step3[18] = ADD_EPI16(step2[21], step1[18]); - step3[19] = ADD_EPI16(step2[20], step1[19]); - step3[20] = SUB_EPI16(step1[19], step2[20]); - step3[21] = SUB_EPI16(step1[18], step2[21]); - step3[22] = SUB_EPI16(step1[17], step2[22]); - step3[23] = SUB_EPI16(step1[16], step2[23]); - step3[24] = SUB_EPI16(step1[31], step2[24]); - step3[25] = SUB_EPI16(step1[30], step2[25]); - step3[26] = SUB_EPI16(step1[29], step2[26]); - step3[27] = SUB_EPI16(step1[28], step2[27]); - step3[28] = ADD_EPI16(step2[27], step1[28]); - step3[29] = ADD_EPI16(step2[26], step1[29]); - step3[30] = ADD_EPI16(step2[25], step1[30]); - step3[31] = ADD_EPI16(step2[24], step1[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step3[16], &step3[17], &step3[18], &step3[19], &step3[20], - &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], - &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], - &step3[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - - // Stage 4 - { - step1[0] = ADD_EPI16(step3[3], step3[0]); - step1[1] = ADD_EPI16(step3[2], step3[1]); - step1[2] = SUB_EPI16(step3[1], step3[2]); - step1[3] = SUB_EPI16(step3[0], step3[3]); - step1[8] = ADD_EPI16(step3[11], step2[8]); - step1[9] = ADD_EPI16(step3[10], step2[9]); - step1[10] = SUB_EPI16(step2[9], step3[10]); - step1[11] = SUB_EPI16(step2[8], step3[11]); - step1[12] = SUB_EPI16(step2[15], step3[12]); - step1[13] = SUB_EPI16(step2[14], step3[13]); - step1[14] = ADD_EPI16(step3[13], step2[14]); - step1[15] = ADD_EPI16(step3[12], step2[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], - &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], - &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], - &step1[21], &step1[26], &step1[27], - &step1[28], &step1[29]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 5 - { - step2[4] = ADD_EPI16(step1[5], step3[4]); - step2[5] = SUB_EPI16(step3[4], step1[5]); - step2[6] = SUB_EPI16(step3[7], step1[6]); - step2[7] = ADD_EPI16(step1[6], step3[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6], - &step2[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = - _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = - _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = - _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = - _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = - _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = - _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = - _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = - _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], - &step2[14]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step2[16] = ADD_EPI16(step1[19], step3[16]); - step2[17] = ADD_EPI16(step1[18], step3[17]); - step2[18] = SUB_EPI16(step3[17], step1[18]); - step2[19] = SUB_EPI16(step3[16], step1[19]); - step2[20] = SUB_EPI16(step3[23], step1[20]); - step2[21] = SUB_EPI16(step3[22], step1[21]); - step2[22] = ADD_EPI16(step1[21], step3[22]); - step2[23] = ADD_EPI16(step1[20], step3[23]); - step2[24] = ADD_EPI16(step1[27], step3[24]); - step2[25] = ADD_EPI16(step1[26], step3[25]); - step2[26] = SUB_EPI16(step3[25], step1[26]); - step2[27] = SUB_EPI16(step3[24], step1[27]); - step2[28] = SUB_EPI16(step3[31], step1[28]); - step2[29] = SUB_EPI16(step3[30], step1[29]); - step2[30] = ADD_EPI16(step1[29], step3[30]); - step2[31] = ADD_EPI16(step1[28], step3[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], - &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], - &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], - &step2[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = - _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = - _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = - _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = - _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = - _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = - _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = - _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = - _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[8] = ADD_EPI16(step2[9], step1[8]); - step3[9] = SUB_EPI16(step1[8], step2[9]); - step3[10] = SUB_EPI16(step1[11], step2[10]); - step3[11] = ADD_EPI16(step2[10], step1[11]); - step3[12] = ADD_EPI16(step2[13], step1[12]); - step3[13] = SUB_EPI16(step1[12], step2[13]); - step3[14] = SUB_EPI16(step1[15], step2[14]); - step3[15] = ADD_EPI16(step2[14], step1[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], - &step3[11], &step3[12], &step3[13], - &step3[14], &step3[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], - &step3[22], &step3[25], &step3[26], - &step3[29], &step3[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = - _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = - _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = - _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = - _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = - _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = - _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = - _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = - _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = - _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = - _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = - _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = - _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = - _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = - _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = - _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = - _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], - &out[6], &out[22], &out[14], &out[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step1[16] = ADD_EPI16(step3[17], step2[16]); - step1[17] = SUB_EPI16(step2[16], step3[17]); - step1[18] = SUB_EPI16(step2[19], step3[18]); - step1[19] = ADD_EPI16(step3[18], step2[19]); - step1[20] = ADD_EPI16(step3[21], step2[20]); - step1[21] = SUB_EPI16(step2[20], step3[21]); - step1[22] = SUB_EPI16(step2[23], step3[22]); - step1[23] = ADD_EPI16(step3[22], step2[23]); - step1[24] = ADD_EPI16(step3[25], step2[24]); - step1[25] = SUB_EPI16(step2[24], step3[25]); - step1[26] = SUB_EPI16(step2[27], step3[26]); - step1[27] = ADD_EPI16(step3[26], step2[27]); - step1[28] = ADD_EPI16(step3[29], step2[28]); - step1[29] = SUB_EPI16(step2[28], step3[29]); - step1[30] = SUB_EPI16(step2[31], step3[30]); - step1[31] = ADD_EPI16(step3[30], step2[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], - &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], - &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], - &step1[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = - _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = - _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = - _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = - _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = - _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = - _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = - _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = - _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = - _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = - _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = - _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = - _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = - _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = - _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = - _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = - _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], - &out[7], &out[23], &out[15], &out[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = - _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = - _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = - _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = - _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = - _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = - _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = - _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = - _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = - _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = - _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = - _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = - _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = - _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = - _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = - _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = - _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], - &out[3], &out[19], &out[11], &out[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } -#if FDCT32x32_HIGH_PRECISION - } else { - __m128i lstep1[64], lstep2[64], lstep3[64]; - __m128i u[32], v[32], sign[16]; - const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); - // start using 32-bit operations - // stage 3 - { - // expanding to 32-bit length priori to addition operations - lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); - lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); - lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); - lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); - lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); - lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); - lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); - lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); - lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); - lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); - lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero); - lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); - lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); - lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); - lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); - lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); - lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); - lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); - lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); - lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); - lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); - lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); - lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); - lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); - lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); - lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); - lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); - lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); - lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); - lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); - lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); - lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); - - lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); - lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); - lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); - lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); - lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); - lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); - lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); - lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); - lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); - lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); - lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); - lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); - lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); - lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); - lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); - lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - } - { - lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); - lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); - lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); - lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); - lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); - lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); - lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); - lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); - lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); - lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); - lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); - lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); - lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); - lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); - lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); - lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); - lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); - lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); - lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); - lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); - lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); - lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); - lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); - lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); - lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); - lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); - lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); - lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); - lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); - lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); - lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); - lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); - - lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); - lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); - lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); - lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); - lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); - lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); - lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); - lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); - lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); - lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); - lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); - lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); - lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); - lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); - lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); - lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); - lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); - lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); - lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); - lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); - lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); - lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); - lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); - lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); - lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); - lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); - lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); - lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); - lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); - lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); - lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); - lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); - - lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); - lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); - - lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); - lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); - lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); - lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); - lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); - lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); - lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); - lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); - lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); - lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); - lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); - lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); - lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); - lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); - lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); - lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); - lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); - lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); - lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); - lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); - lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); - lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); - lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); - lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); - lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); - lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); - lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); - lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); - lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); - lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); - } - - // stage 4 - { - // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); - lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); - lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); - lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); - lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); - lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); - lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); - lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); - lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); - lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); - lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); - lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); - lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); - lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); - lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); - lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); - - lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); - lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); - lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); - lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); - lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]); - lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); - lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); - lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); - lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); - lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); - lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); - lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); - lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); - lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); - lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); - lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); - lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); - lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); - lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); - lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); - lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); - lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); - lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); - lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); - } - { - // to be continued... - // - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_m16); - v[1] = k_madd_epi32(u[1], k32_p16_m16); - v[2] = k_madd_epi32(u[2], k32_p16_m16); - v[3] = k_madd_epi32(u[3], k32_p16_m16); - v[4] = k_madd_epi32(u[0], k32_p16_p16); - v[5] = k_madd_epi32(u[1], k32_p16_p16); - v[6] = k_madd_epi32(u[2], k32_p16_p16); - v[7] = k_madd_epi32(u[3], k32_p16_p16); -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], - &v[5], &v[6], &v[7], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - } - { - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); - u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); - u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); - u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); - u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); - u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); - u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); - u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); - u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); - u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); - u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); - u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); - u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); - u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); - u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); - u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); - - v[0] = k_madd_epi32(u[0], k32_m08_p24); - v[1] = k_madd_epi32(u[1], k32_m08_p24); - v[2] = k_madd_epi32(u[2], k32_m08_p24); - v[3] = k_madd_epi32(u[3], k32_m08_p24); - v[4] = k_madd_epi32(u[4], k32_m08_p24); - v[5] = k_madd_epi32(u[5], k32_m08_p24); - v[6] = k_madd_epi32(u[6], k32_m08_p24); - v[7] = k_madd_epi32(u[7], k32_m08_p24); - v[8] = k_madd_epi32(u[8], k32_m24_m08); - v[9] = k_madd_epi32(u[9], k32_m24_m08); - v[10] = k_madd_epi32(u[10], k32_m24_m08); - v[11] = k_madd_epi32(u[11], k32_m24_m08); - v[12] = k_madd_epi32(u[12], k32_m24_m08); - v[13] = k_madd_epi32(u[13], k32_m24_m08); - v[14] = k_madd_epi32(u[14], k32_m24_m08); - v[15] = k_madd_epi32(u[15], k32_m24_m08); - v[16] = k_madd_epi32(u[12], k32_m08_p24); - v[17] = k_madd_epi32(u[13], k32_m08_p24); - v[18] = k_madd_epi32(u[14], k32_m08_p24); - v[19] = k_madd_epi32(u[15], k32_m08_p24); - v[20] = k_madd_epi32(u[8], k32_m08_p24); - v[21] = k_madd_epi32(u[9], k32_m08_p24); - v[22] = k_madd_epi32(u[10], k32_m08_p24); - v[23] = k_madd_epi32(u[11], k32_m08_p24); - v[24] = k_madd_epi32(u[4], k32_p24_p08); - v[25] = k_madd_epi32(u[5], k32_p24_p08); - v[26] = k_madd_epi32(u[6], k32_p24_p08); - v[27] = k_madd_epi32(u[7], k32_p24_p08); - v[28] = k_madd_epi32(u[0], k32_p24_p08); - v[29] = k_madd_epi32(u[1], k32_p24_p08); - v[30] = k_madd_epi32(u[2], k32_p24_p08); - v[31] = k_madd_epi32(u[3], k32_p24_p08); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 5 - { - lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); - lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); - lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); - lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); - lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); - lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); - lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); - lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); - } - { - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - - u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); - u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); - u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); - u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); - u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); - u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); - u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); - u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_p16); - v[1] = k_madd_epi32(u[1], k32_p16_p16); - v[2] = k_madd_epi32(u[2], k32_p16_p16); - v[3] = k_madd_epi32(u[3], k32_p16_p16); - v[4] = k_madd_epi32(u[0], k32_p16_m16); - v[5] = k_madd_epi32(u[1], k32_p16_m16); - v[6] = k_madd_epi32(u[2], k32_p16_m16); - v[7] = k_madd_epi32(u[3], k32_p16_m16); - v[8] = k_madd_epi32(u[4], k32_p24_p08); - v[9] = k_madd_epi32(u[5], k32_p24_p08); - v[10] = k_madd_epi32(u[6], k32_p24_p08); - v[11] = k_madd_epi32(u[7], k32_p24_p08); - v[12] = k_madd_epi32(u[4], k32_m08_p24); - v[13] = k_madd_epi32(u[5], k32_m08_p24); - v[14] = k_madd_epi32(u[6], k32_m08_p24); - v[15] = k_madd_epi32(u[7], k32_m08_p24); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm_cmplt_epi32(u[0], kZero); - sign[1] = _mm_cmplt_epi32(u[1], kZero); - sign[2] = _mm_cmplt_epi32(u[2], kZero); - sign[3] = _mm_cmplt_epi32(u[3], kZero); - sign[4] = _mm_cmplt_epi32(u[4], kZero); - sign[5] = _mm_cmplt_epi32(u[5], kZero); - sign[6] = _mm_cmplt_epi32(u[6], kZero); - sign[7] = _mm_cmplt_epi32(u[7], kZero); - - u[0] = _mm_sub_epi32(u[0], sign[0]); - u[1] = _mm_sub_epi32(u[1], sign[1]); - u[2] = _mm_sub_epi32(u[2], sign[2]); - u[3] = _mm_sub_epi32(u[3], sign[3]); - u[4] = _mm_sub_epi32(u[4], sign[4]); - u[5] = _mm_sub_epi32(u[5], sign[5]); - u[6] = _mm_sub_epi32(u[6], sign[6]); - u[7] = _mm_sub_epi32(u[7], sign[7]); - - u[0] = _mm_add_epi32(u[0], K32One); - u[1] = _mm_add_epi32(u[1], K32One); - u[2] = _mm_add_epi32(u[2], K32One); - u[3] = _mm_add_epi32(u[3], K32One); - u[4] = _mm_add_epi32(u[4], K32One); - u[5] = _mm_add_epi32(u[5], K32One); - u[6] = _mm_add_epi32(u[6], K32One); - u[7] = _mm_add_epi32(u[7], K32One); - - u[0] = _mm_srai_epi32(u[0], 2); - u[1] = _mm_srai_epi32(u[1], 2); - u[2] = _mm_srai_epi32(u[2], 2); - u[3] = _mm_srai_epi32(u[3], 2); - u[4] = _mm_srai_epi32(u[4], 2); - u[5] = _mm_srai_epi32(u[5], 2); - u[6] = _mm_srai_epi32(u[6], 2); - u[7] = _mm_srai_epi32(u[7], 2); - - // Combine - out[0] = _mm_packs_epi32(u[0], u[1]); - out[16] = _mm_packs_epi32(u[2], u[3]); - out[8] = _mm_packs_epi32(u[4], u[5]); - out[24] = _mm_packs_epi32(u[6], u[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); - u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); - u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); - u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); - u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); - u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); - u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); - u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); - - v[0] = k_madd_epi32(u[0], k32_m08_p24); - v[1] = k_madd_epi32(u[1], k32_m08_p24); - v[2] = k_madd_epi32(u[2], k32_m08_p24); - v[3] = k_madd_epi32(u[3], k32_m08_p24); - v[4] = k_madd_epi32(u[4], k32_m24_m08); - v[5] = k_madd_epi32(u[5], k32_m24_m08); - v[6] = k_madd_epi32(u[6], k32_m24_m08); - v[7] = k_madd_epi32(u[7], k32_m24_m08); - v[8] = k_madd_epi32(u[4], k32_m08_p24); - v[9] = k_madd_epi32(u[5], k32_m08_p24); - v[10] = k_madd_epi32(u[6], k32_m08_p24); - v[11] = k_madd_epi32(u[7], k32_m08_p24); - v[12] = k_madd_epi32(u[0], k32_p24_p08); - v[13] = k_madd_epi32(u[1], k32_p24_p08); - v[14] = k_madd_epi32(u[2], k32_p24_p08); - v[15] = k_madd_epi32(u[3], k32_p24_p08); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - } - { - lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); - lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); - lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); - lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); - lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); - lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); - lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); - lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); - lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); - lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); - lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); - lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); - lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); - lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); - lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); - lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); - lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); - lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); - lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); - lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); - lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); - lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); - lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); - lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); - lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); - lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); - lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); - lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); - lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); - lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); - lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); - lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); - } - // stage 6 - { - const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); - const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); - u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); - u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); - u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); - u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); - u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); - u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); - u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); - u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); - u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); - u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); - u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); - u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); - u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); - - v[0] = k_madd_epi32(u[0], k32_p28_p04); - v[1] = k_madd_epi32(u[1], k32_p28_p04); - v[2] = k_madd_epi32(u[2], k32_p28_p04); - v[3] = k_madd_epi32(u[3], k32_p28_p04); - v[4] = k_madd_epi32(u[4], k32_p12_p20); - v[5] = k_madd_epi32(u[5], k32_p12_p20); - v[6] = k_madd_epi32(u[6], k32_p12_p20); - v[7] = k_madd_epi32(u[7], k32_p12_p20); - v[8] = k_madd_epi32(u[8], k32_m20_p12); - v[9] = k_madd_epi32(u[9], k32_m20_p12); - v[10] = k_madd_epi32(u[10], k32_m20_p12); - v[11] = k_madd_epi32(u[11], k32_m20_p12); - v[12] = k_madd_epi32(u[12], k32_m04_p28); - v[13] = k_madd_epi32(u[13], k32_m04_p28); - v[14] = k_madd_epi32(u[14], k32_m04_p28); - v[15] = k_madd_epi32(u[15], k32_m04_p28); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm_cmplt_epi32(u[0], kZero); - sign[1] = _mm_cmplt_epi32(u[1], kZero); - sign[2] = _mm_cmplt_epi32(u[2], kZero); - sign[3] = _mm_cmplt_epi32(u[3], kZero); - sign[4] = _mm_cmplt_epi32(u[4], kZero); - sign[5] = _mm_cmplt_epi32(u[5], kZero); - sign[6] = _mm_cmplt_epi32(u[6], kZero); - sign[7] = _mm_cmplt_epi32(u[7], kZero); - - u[0] = _mm_sub_epi32(u[0], sign[0]); - u[1] = _mm_sub_epi32(u[1], sign[1]); - u[2] = _mm_sub_epi32(u[2], sign[2]); - u[3] = _mm_sub_epi32(u[3], sign[3]); - u[4] = _mm_sub_epi32(u[4], sign[4]); - u[5] = _mm_sub_epi32(u[5], sign[5]); - u[6] = _mm_sub_epi32(u[6], sign[6]); - u[7] = _mm_sub_epi32(u[7], sign[7]); - - u[0] = _mm_add_epi32(u[0], K32One); - u[1] = _mm_add_epi32(u[1], K32One); - u[2] = _mm_add_epi32(u[2], K32One); - u[3] = _mm_add_epi32(u[3], K32One); - u[4] = _mm_add_epi32(u[4], K32One); - u[5] = _mm_add_epi32(u[5], K32One); - u[6] = _mm_add_epi32(u[6], K32One); - u[7] = _mm_add_epi32(u[7], K32One); - - u[0] = _mm_srai_epi32(u[0], 2); - u[1] = _mm_srai_epi32(u[1], 2); - u[2] = _mm_srai_epi32(u[2], 2); - u[3] = _mm_srai_epi32(u[3], 2); - u[4] = _mm_srai_epi32(u[4], 2); - u[5] = _mm_srai_epi32(u[5], 2); - u[6] = _mm_srai_epi32(u[6], 2); - u[7] = _mm_srai_epi32(u[7], 2); - - out[4] = _mm_packs_epi32(u[0], u[1]); - out[20] = _mm_packs_epi32(u[2], u[3]); - out[12] = _mm_packs_epi32(u[4], u[5]); - out[28] = _mm_packs_epi32(u[6], u[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); - lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); - lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); - lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); - lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); - lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); - lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); - lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); - lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); - lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); - lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); - lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); - lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); - lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); - lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); - lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); - } - { - const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); - const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m12_m20 = - pair_set_epi32(-cospi_12_64, -cospi_20_64); - const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); - const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - - u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); - u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); - u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); - u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); - u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); - u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); - u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); - u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); - u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); - u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); - u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); - u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); - u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); - u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); - u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); - u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); - - v[0] = k_madd_epi32(u[0], k32_m04_p28); - v[1] = k_madd_epi32(u[1], k32_m04_p28); - v[2] = k_madd_epi32(u[2], k32_m04_p28); - v[3] = k_madd_epi32(u[3], k32_m04_p28); - v[4] = k_madd_epi32(u[4], k32_m28_m04); - v[5] = k_madd_epi32(u[5], k32_m28_m04); - v[6] = k_madd_epi32(u[6], k32_m28_m04); - v[7] = k_madd_epi32(u[7], k32_m28_m04); - v[8] = k_madd_epi32(u[8], k32_m20_p12); - v[9] = k_madd_epi32(u[9], k32_m20_p12); - v[10] = k_madd_epi32(u[10], k32_m20_p12); - v[11] = k_madd_epi32(u[11], k32_m20_p12); - v[12] = k_madd_epi32(u[12], k32_m12_m20); - v[13] = k_madd_epi32(u[13], k32_m12_m20); - v[14] = k_madd_epi32(u[14], k32_m12_m20); - v[15] = k_madd_epi32(u[15], k32_m12_m20); - v[16] = k_madd_epi32(u[12], k32_m20_p12); - v[17] = k_madd_epi32(u[13], k32_m20_p12); - v[18] = k_madd_epi32(u[14], k32_m20_p12); - v[19] = k_madd_epi32(u[15], k32_m20_p12); - v[20] = k_madd_epi32(u[8], k32_p12_p20); - v[21] = k_madd_epi32(u[9], k32_p12_p20); - v[22] = k_madd_epi32(u[10], k32_p12_p20); - v[23] = k_madd_epi32(u[11], k32_p12_p20); - v[24] = k_madd_epi32(u[4], k32_m04_p28); - v[25] = k_madd_epi32(u[5], k32_m04_p28); - v[26] = k_madd_epi32(u[6], k32_m04_p28); - v[27] = k_madd_epi32(u[7], k32_m04_p28); - v[28] = k_madd_epi32(u[0], k32_p28_p04); - v[29] = k_madd_epi32(u[1], k32_p28_p04); - v[30] = k_madd_epi32(u[2], k32_p28_p04); - v[31] = k_madd_epi32(u[3], k32_p28_p04); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 7 - { - const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); - const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); - const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); - const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); - const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); - const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); - const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); - const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); - - u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); - u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); - u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); - u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); - u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); - u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); - u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); - u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); - u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); - u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); - u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); - u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); - u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); - u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); - u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); - u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); - - v[0] = k_madd_epi32(u[0], k32_p30_p02); - v[1] = k_madd_epi32(u[1], k32_p30_p02); - v[2] = k_madd_epi32(u[2], k32_p30_p02); - v[3] = k_madd_epi32(u[3], k32_p30_p02); - v[4] = k_madd_epi32(u[4], k32_p14_p18); - v[5] = k_madd_epi32(u[5], k32_p14_p18); - v[6] = k_madd_epi32(u[6], k32_p14_p18); - v[7] = k_madd_epi32(u[7], k32_p14_p18); - v[8] = k_madd_epi32(u[8], k32_p22_p10); - v[9] = k_madd_epi32(u[9], k32_p22_p10); - v[10] = k_madd_epi32(u[10], k32_p22_p10); - v[11] = k_madd_epi32(u[11], k32_p22_p10); - v[12] = k_madd_epi32(u[12], k32_p06_p26); - v[13] = k_madd_epi32(u[13], k32_p06_p26); - v[14] = k_madd_epi32(u[14], k32_p06_p26); - v[15] = k_madd_epi32(u[15], k32_p06_p26); - v[16] = k_madd_epi32(u[12], k32_m26_p06); - v[17] = k_madd_epi32(u[13], k32_m26_p06); - v[18] = k_madd_epi32(u[14], k32_m26_p06); - v[19] = k_madd_epi32(u[15], k32_m26_p06); - v[20] = k_madd_epi32(u[8], k32_m10_p22); - v[21] = k_madd_epi32(u[9], k32_m10_p22); - v[22] = k_madd_epi32(u[10], k32_m10_p22); - v[23] = k_madd_epi32(u[11], k32_m10_p22); - v[24] = k_madd_epi32(u[4], k32_m18_p14); - v[25] = k_madd_epi32(u[5], k32_m18_p14); - v[26] = k_madd_epi32(u[6], k32_m18_p14); - v[27] = k_madd_epi32(u[7], k32_m18_p14); - v[28] = k_madd_epi32(u[0], k32_m02_p30); - v[29] = k_madd_epi32(u[1], k32_m02_p30); - v[30] = k_madd_epi32(u[2], k32_m02_p30); - v[31] = k_madd_epi32(u[3], k32_m02_p30); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[2] = _mm_packs_epi32(u[0], u[1]); - out[18] = _mm_packs_epi32(u[2], u[3]); - out[10] = _mm_packs_epi32(u[4], u[5]); - out[26] = _mm_packs_epi32(u[6], u[7]); - out[6] = _mm_packs_epi32(u[8], u[9]); - out[22] = _mm_packs_epi32(u[10], u[11]); - out[14] = _mm_packs_epi32(u[12], u[13]); - out[30] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], - &out[6], &out[22], &out[14], &out[30]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); - lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); - lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); - lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); - lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); - lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); - lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); - lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); - lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); - lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); - lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); - lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); - lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); - lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); - lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); - lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); - lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); - lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); - lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); - lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); - lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); - lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); - lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); - lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); - lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); - lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); - lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); - lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); - lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); - lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); - lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); - lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); - } - // stage 8 - { - const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); - const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); - const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); - const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); - const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); - const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); - const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); - const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); - - u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); - u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); - u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); - u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); - u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); - u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); - u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); - u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); - u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); - u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); - u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); - u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); - u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); - u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); - u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); - u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); - - v[0] = k_madd_epi32(u[0], k32_p31_p01); - v[1] = k_madd_epi32(u[1], k32_p31_p01); - v[2] = k_madd_epi32(u[2], k32_p31_p01); - v[3] = k_madd_epi32(u[3], k32_p31_p01); - v[4] = k_madd_epi32(u[4], k32_p15_p17); - v[5] = k_madd_epi32(u[5], k32_p15_p17); - v[6] = k_madd_epi32(u[6], k32_p15_p17); - v[7] = k_madd_epi32(u[7], k32_p15_p17); - v[8] = k_madd_epi32(u[8], k32_p23_p09); - v[9] = k_madd_epi32(u[9], k32_p23_p09); - v[10] = k_madd_epi32(u[10], k32_p23_p09); - v[11] = k_madd_epi32(u[11], k32_p23_p09); - v[12] = k_madd_epi32(u[12], k32_p07_p25); - v[13] = k_madd_epi32(u[13], k32_p07_p25); - v[14] = k_madd_epi32(u[14], k32_p07_p25); - v[15] = k_madd_epi32(u[15], k32_p07_p25); - v[16] = k_madd_epi32(u[12], k32_m25_p07); - v[17] = k_madd_epi32(u[13], k32_m25_p07); - v[18] = k_madd_epi32(u[14], k32_m25_p07); - v[19] = k_madd_epi32(u[15], k32_m25_p07); - v[20] = k_madd_epi32(u[8], k32_m09_p23); - v[21] = k_madd_epi32(u[9], k32_m09_p23); - v[22] = k_madd_epi32(u[10], k32_m09_p23); - v[23] = k_madd_epi32(u[11], k32_m09_p23); - v[24] = k_madd_epi32(u[4], k32_m17_p15); - v[25] = k_madd_epi32(u[5], k32_m17_p15); - v[26] = k_madd_epi32(u[6], k32_m17_p15); - v[27] = k_madd_epi32(u[7], k32_m17_p15); - v[28] = k_madd_epi32(u[0], k32_m01_p31); - v[29] = k_madd_epi32(u[1], k32_m01_p31); - v[30] = k_madd_epi32(u[2], k32_m01_p31); - v[31] = k_madd_epi32(u[3], k32_m01_p31); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[1] = _mm_packs_epi32(u[0], u[1]); - out[17] = _mm_packs_epi32(u[2], u[3]); - out[9] = _mm_packs_epi32(u[4], u[5]); - out[25] = _mm_packs_epi32(u[6], u[7]); - out[7] = _mm_packs_epi32(u[8], u[9]); - out[23] = _mm_packs_epi32(u[10], u[11]); - out[15] = _mm_packs_epi32(u[12], u[13]); - out[31] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], - &out[7], &out[23], &out[15], &out[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); - const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); - const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); - const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); - const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); - const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); - const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); - const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); - - u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); - u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); - u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); - u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); - u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); - u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); - u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); - u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); - u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); - u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); - u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); - u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); - u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); - u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); - u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); - u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); - - v[0] = k_madd_epi32(u[0], k32_p27_p05); - v[1] = k_madd_epi32(u[1], k32_p27_p05); - v[2] = k_madd_epi32(u[2], k32_p27_p05); - v[3] = k_madd_epi32(u[3], k32_p27_p05); - v[4] = k_madd_epi32(u[4], k32_p11_p21); - v[5] = k_madd_epi32(u[5], k32_p11_p21); - v[6] = k_madd_epi32(u[6], k32_p11_p21); - v[7] = k_madd_epi32(u[7], k32_p11_p21); - v[8] = k_madd_epi32(u[8], k32_p19_p13); - v[9] = k_madd_epi32(u[9], k32_p19_p13); - v[10] = k_madd_epi32(u[10], k32_p19_p13); - v[11] = k_madd_epi32(u[11], k32_p19_p13); - v[12] = k_madd_epi32(u[12], k32_p03_p29); - v[13] = k_madd_epi32(u[13], k32_p03_p29); - v[14] = k_madd_epi32(u[14], k32_p03_p29); - v[15] = k_madd_epi32(u[15], k32_p03_p29); - v[16] = k_madd_epi32(u[12], k32_m29_p03); - v[17] = k_madd_epi32(u[13], k32_m29_p03); - v[18] = k_madd_epi32(u[14], k32_m29_p03); - v[19] = k_madd_epi32(u[15], k32_m29_p03); - v[20] = k_madd_epi32(u[8], k32_m13_p19); - v[21] = k_madd_epi32(u[9], k32_m13_p19); - v[22] = k_madd_epi32(u[10], k32_m13_p19); - v[23] = k_madd_epi32(u[11], k32_m13_p19); - v[24] = k_madd_epi32(u[4], k32_m21_p11); - v[25] = k_madd_epi32(u[5], k32_m21_p11); - v[26] = k_madd_epi32(u[6], k32_m21_p11); - v[27] = k_madd_epi32(u[7], k32_m21_p11); - v[28] = k_madd_epi32(u[0], k32_m05_p27); - v[29] = k_madd_epi32(u[1], k32_m05_p27); - v[30] = k_madd_epi32(u[2], k32_m05_p27); - v[31] = k_madd_epi32(u[3], k32_m05_p27); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[5] = _mm_packs_epi32(u[0], u[1]); - out[21] = _mm_packs_epi32(u[2], u[3]); - out[13] = _mm_packs_epi32(u[4], u[5]); - out[29] = _mm_packs_epi32(u[6], u[7]); - out[3] = _mm_packs_epi32(u[8], u[9]); - out[19] = _mm_packs_epi32(u[10], u[11]); - out[11] = _mm_packs_epi32(u[12], u[13]); - out[27] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], - &out[3], &out[19], &out[11], &out[27]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } -#endif // FDCT32x32_HIGH_PRECISION - // Transpose the results, do it as four 8x8 transposes. - { - int transpose_block; - int16_t *output0 = &intermediate[column_start * 32]; - tran_low_t *output1 = &output_org[column_start * 32]; - for (transpose_block = 0; transpose_block < 4; ++transpose_block) { - __m128i *this_out = &out[8 * transpose_block]; - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (0 == pass) { - // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; - // TODO(cd): see quality impact of only doing - // output[j] = (output[j] + 1) >> 2; - // which would remove the code between here ... - __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); - __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); - __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); - __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); - __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); - __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); - __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); - __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); - tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); - tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); - tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); - tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); - tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); - tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); - tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); - tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); - // ... and here. - // PS: also change code in av1/encoder/dct.c - tr2_0 = _mm_add_epi16(tr2_0, kOne); - tr2_1 = _mm_add_epi16(tr2_1, kOne); - tr2_2 = _mm_add_epi16(tr2_2, kOne); - tr2_3 = _mm_add_epi16(tr2_3, kOne); - tr2_4 = _mm_add_epi16(tr2_4, kOne); - tr2_5 = _mm_add_epi16(tr2_5, kOne); - tr2_6 = _mm_add_epi16(tr2_6, kOne); - tr2_7 = _mm_add_epi16(tr2_7, kOne); - tr2_0 = _mm_srai_epi16(tr2_0, 2); - tr2_1 = _mm_srai_epi16(tr2_1, 2); - tr2_2 = _mm_srai_epi16(tr2_2, 2); - tr2_3 = _mm_srai_epi16(tr2_3, 2); - tr2_4 = _mm_srai_epi16(tr2_4, 2); - tr2_5 = _mm_srai_epi16(tr2_5, 2); - tr2_6 = _mm_srai_epi16(tr2_6, 2); - tr2_7 = _mm_srai_epi16(tr2_7, 2); - } - // Note: even though all these stores are aligned, using the aligned - // intrinsic make the code slightly slower. - if (pass == 0) { - _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0); - _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1); - _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2); - _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3); - _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4); - _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5); - _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6); - _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7); - // Process next 8x8 - output0 += 8; - } else { - storeu_output(&tr2_0, (output1 + 0 * 32)); - storeu_output(&tr2_1, (output1 + 1 * 32)); - storeu_output(&tr2_2, (output1 + 2 * 32)); - storeu_output(&tr2_3, (output1 + 3 * 32)); - storeu_output(&tr2_4, (output1 + 4 * 32)); - storeu_output(&tr2_5, (output1 + 5 * 32)); - storeu_output(&tr2_6, (output1 + 6 * 32)); - storeu_output(&tr2_7, (output1 + 7 * 32)); - // Process next 8x8 - output1 += 8; - } - } - } - } - } -} // NOLINT - -#undef ADD_EPI16 -#undef SUB_EPI16 -#undef HIGH_FDCT32x32_2D_C -#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c index f0bcef9..c09a019 100644 --- a/av1/common/x86/av1_fwd_txfm1d_sse4.c +++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -1,354 +1,5 @@ #include "av1/common/x86/av1_txfm1d_sse4.h" -void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 4; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[4]; - __m128i buf1[4]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - buf0[0] = input[0 * col_num + col]; - buf0[1] = input[1 * col_num + col]; - buf0[2] = input[2 * col_num + col]; - buf0[3] = input[3 * col_num + col]; - - // stage 1 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[3]); - buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[2]); - buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]); - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], - buf0[1], bit); - btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - - // stage 3 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[0]; - buf1[1] = buf0[2]; - buf1[2] = buf0[1]; - buf1[3] = buf0[3]; - - output[0 * col_num + col] = buf1[0]; - output[1 * col_num + col] = buf1[1]; - output[2 * col_num + col] = buf1[2]; - output[3 * col_num + col] = buf1[3]; - } -} - -void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 8; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[8]; - __m128i buf1[8]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - buf0[0] = input[0 * col_num + col]; - buf0[1] = input[1 * col_num + col]; - buf0[2] = input[2 * col_num + col]; - buf0[3] = input[3 * col_num + col]; - buf0[4] = input[4 * col_num + col]; - buf0[5] = input[5 * col_num + col]; - buf0[6] = input[6 * col_num + col]; - buf0[7] = input[7 * col_num + col]; - - // stage 1 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); - buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); - buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); - buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); - buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); - buf0[4] = buf1[4]; - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], - buf0[6], bit); - buf0[7] = buf1[7]; - - // stage 3 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], - buf1[1], bit); - btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], - buf1[3], bit); - buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); - buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], - bit); - btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], - buf0[6], bit); - - // stage 5 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[0]; - buf1[1] = buf0[4]; - buf1[2] = buf0[2]; - buf1[3] = buf0[6]; - buf1[4] = buf0[1]; - buf1[5] = buf0[5]; - buf1[6] = buf0[3]; - buf1[7] = buf0[7]; - - output[0 * col_num + col] = buf1[0]; - output[1 * col_num + col] = buf1[1]; - output[2 * col_num + col] = buf1[2]; - output[3 * col_num + col] = buf1[3]; - output[4 * col_num + col] = buf1[4]; - output[5 * col_num + col] = buf1[5]; - output[6 * col_num + col] = buf1[6]; - output[7 * col_num + col] = buf1[7]; - } -} - -void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 16; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[16]; - __m128i buf1[16]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - buf0[0] = input[0 * col_num + col]; - buf0[1] = input[1 * col_num + col]; - buf0[2] = input[2 * col_num + col]; - buf0[3] = input[3 * col_num + col]; - buf0[4] = input[4 * col_num + col]; - buf0[5] = input[5 * col_num + col]; - buf0[6] = input[6 * col_num + col]; - buf0[7] = input[7 * col_num + col]; - buf0[8] = input[8 * col_num + col]; - buf0[9] = input[9 * col_num + col]; - buf0[10] = input[10 * col_num + col]; - buf0[11] = input[11 * col_num + col]; - buf0[12] = input[12 * col_num + col]; - buf0[13] = input[13 * col_num + col]; - buf0[14] = input[14 * col_num + col]; - buf0[15] = input[15 * col_num + col]; - - // stage 1 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[13]); - buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[12]); - buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[11]); - buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[10]); - buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]); - buf1[6] = _mm_add_epi32(buf0[6], buf0[9]); - buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[8]); - buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]); - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = _mm_add_epi32(buf1[0], buf1[7]); - buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]); - buf0[1] = _mm_add_epi32(buf1[1], buf1[6]); - buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]); - buf0[2] = _mm_add_epi32(buf1[2], buf1[5]); - buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]); - buf0[3] = _mm_add_epi32(buf1[3], buf1[4]); - buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]); - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11], - buf0[12], bit); - buf0[14] = buf1[14]; - buf0[15] = buf1[15]; - - // stage 3 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[3]); - buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[2]); - buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]); - buf1[4] = buf0[4]; - btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], - buf1[6], bit); - buf1[7] = buf0[7]; - buf1[8] = _mm_add_epi32(buf0[8], buf0[11]); - buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[10]); - buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]); - buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]); - buf1[15] = _mm_add_epi32(buf0[15], buf0[12]); - buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]); - buf1[14] = _mm_add_epi32(buf0[14], buf0[13]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], - buf0[1], bit); - btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - buf0[4] = _mm_add_epi32(buf1[4], buf1[5]); - buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]); - buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]); - buf0[7] = _mm_add_epi32(buf1[7], buf1[6]); - buf0[8] = buf1[8]; - btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9], - buf0[14], bit); - btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - buf0[11] = buf1[11]; - buf0[12] = buf1[12]; - buf0[15] = buf1[15]; - - // stage 5 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[0]; - buf1[1] = buf0[1]; - buf1[2] = buf0[2]; - buf1[3] = buf0[3]; - btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7], - bit); - btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], - buf1[6], bit); - buf1[8] = _mm_add_epi32(buf0[8], buf0[9]); - buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]); - buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]); - buf1[11] = _mm_add_epi32(buf0[11], buf0[10]); - buf1[12] = _mm_add_epi32(buf0[12], buf0[13]); - buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]); - buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]); - buf1[15] = _mm_add_epi32(buf0[15], buf0[14]); - - // stage 6 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - buf0[6] = buf1[6]; - buf0[7] = buf1[7]; - btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], - buf0[15], bit); - btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9], - buf0[14], bit); - btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11], - buf0[12], bit); - - // stage 7 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[0]; - buf1[1] = buf0[8]; - buf1[2] = buf0[4]; - buf1[3] = buf0[12]; - buf1[4] = buf0[2]; - buf1[5] = buf0[10]; - buf1[6] = buf0[6]; - buf1[7] = buf0[14]; - buf1[8] = buf0[1]; - buf1[9] = buf0[9]; - buf1[10] = buf0[5]; - buf1[11] = buf0[13]; - buf1[12] = buf0[3]; - buf1[13] = buf0[11]; - buf1[14] = buf0[7]; - buf1[15] = buf0[15]; - - output[0 * col_num + col] = buf1[0]; - output[1 * col_num + col] = buf1[1]; - output[2 * col_num + col] = buf1[2]; - output[3 * col_num + col] = buf1[3]; - output[4 * col_num + col] = buf1[4]; - output[5 * col_num + col] = buf1[5]; - output[6 * col_num + col] = buf1[6]; - output[7 * col_num + col] = buf1[7]; - output[8 * col_num + col] = buf1[8]; - output[9 * col_num + col] = buf1[9]; - output[10 * col_num + col] = buf1[10]; - output[11 * col_num + col] = buf1[11]; - output[12 * col_num + col] = buf1[12]; - output[13 * col_num + col] = buf1[13]; - output[14 * col_num + col] = buf1[14]; - output[15 * col_num + col] = buf1[15]; - } -} - void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, const int8_t *cos_bit, const int8_t *stage_range) { const int txfm_size = 32; @@ -835,370 +486,6 @@ } } -void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 8; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[8]; - __m128i buf1[8]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - buf0[0] = input[0 * col_num + col]; - buf0[1] = input[1 * col_num + col]; - buf0[2] = input[2 * col_num + col]; - buf0[3] = input[3 * col_num + col]; - buf0[4] = input[4 * col_num + col]; - buf0[5] = input[5 * col_num + col]; - buf0[6] = input[6 * col_num + col]; - buf0[7] = input[7 * col_num + col]; - - // stage 1 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[7]; - buf1[1] = buf0[0]; - buf1[2] = buf0[5]; - buf1[3] = buf0[2]; - buf1[4] = buf0[3]; - buf1[5] = buf0[4]; - buf1[6] = buf0[1]; - buf1[7] = buf0[6]; - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4], - buf0[5], bit); - btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - - // stage 3 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[4]); - buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], - buf0[5], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - - // stage 5 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); - buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); - buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]); - - // stage 6 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - - // stage 7 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[0]; - buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]); - buf1[2] = buf0[6]; - buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); - buf1[4] = buf0[3]; - buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]); - buf1[6] = buf0[5]; - buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); - - output[0 * col_num + col] = buf1[0]; - output[1 * col_num + col] = buf1[1]; - output[2 * col_num + col] = buf1[2]; - output[3 * col_num + col] = buf1[3]; - output[4 * col_num + col] = buf1[4]; - output[5 * col_num + col] = buf1[5]; - output[6 * col_num + col] = buf1[6]; - output[7 * col_num + col] = buf1[7]; - } -} - -void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output, - const int8_t *cos_bit, const int8_t *stage_range) { - const int txfm_size = 16; - const int num_per_128 = 4; - const int32_t *cospi; - __m128i buf0[16]; - __m128i buf1[16]; - int col_num = txfm_size / num_per_128; - int bit; - int col; - (void)stage_range; - for (col = 0; col < col_num; col++) { - // stage 0; - int32_t stage_idx = 0; - buf0[0] = input[0 * col_num + col]; - buf0[1] = input[1 * col_num + col]; - buf0[2] = input[2 * col_num + col]; - buf0[3] = input[3 * col_num + col]; - buf0[4] = input[4 * col_num + col]; - buf0[5] = input[5 * col_num + col]; - buf0[6] = input[6 * col_num + col]; - buf0[7] = input[7 * col_num + col]; - buf0[8] = input[8 * col_num + col]; - buf0[9] = input[9 * col_num + col]; - buf0[10] = input[10 * col_num + col]; - buf0[11] = input[11 * col_num + col]; - buf0[12] = input[12 * col_num + col]; - buf0[13] = input[13 * col_num + col]; - buf0[14] = input[14 * col_num + col]; - buf0[15] = input[15 * col_num + col]; - - // stage 1 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[15]; - buf1[1] = buf0[0]; - buf1[2] = buf0[13]; - buf1[3] = buf0[2]; - buf1[4] = buf0[11]; - buf1[5] = buf0[4]; - buf1[6] = buf0[9]; - buf1[7] = buf0[6]; - buf1[8] = buf0[7]; - buf1[9] = buf0[8]; - buf1[10] = buf0[5]; - buf1[11] = buf0[10]; - buf1[12] = buf0[3]; - buf1[13] = buf0[12]; - buf1[14] = buf0[1]; - buf1[15] = buf0[14]; - - // stage 2 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4], - buf0[5], bit); - btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8], - buf0[9], bit); - btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - - // stage 3 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[8]); - buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[9]); - buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[10]); - buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[11]); - buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[12]); - buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[13]); - buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]); - buf1[6] = _mm_add_epi32(buf0[6], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]); - buf1[7] = _mm_add_epi32(buf0[7], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]); - - // stage 4 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - buf0[6] = buf1[6]; - buf0[7] = buf1[7]; - btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], - bit); - btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - - // stage 5 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[4]); - buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[5]); - buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]); - buf1[2] = _mm_add_epi32(buf0[2], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]); - buf1[3] = _mm_add_epi32(buf0[3], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]); - buf1[8] = _mm_add_epi32(buf0[8], buf0[12]); - buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[13]); - buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]); - buf1[10] = _mm_add_epi32(buf0[10], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]); - buf1[11] = _mm_add_epi32(buf0[11], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]); - - // stage 6 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - buf0[2] = buf1[2]; - buf0[3] = buf1[3]; - btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], - buf0[5], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; - buf0[10] = buf1[10]; - buf0[11] = buf1[11]; - btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - - // stage 7 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); - buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); - buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); - buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); - buf1[4] = _mm_add_epi32(buf0[4], buf0[6]); - buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]); - buf1[5] = _mm_add_epi32(buf0[5], buf0[7]); - buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]); - buf1[8] = _mm_add_epi32(buf0[8], buf0[10]); - buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]); - buf1[9] = _mm_add_epi32(buf0[9], buf0[11]); - buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]); - buf1[12] = _mm_add_epi32(buf0[12], buf0[14]); - buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]); - buf1[13] = _mm_add_epi32(buf0[13], buf0[15]); - buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]); - - // stage 8 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf0[0] = buf1[0]; - buf0[1] = buf1[1]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], - buf0[3], bit); - buf0[4] = buf1[4]; - buf0[5] = buf1[5]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], - buf0[7], bit); - buf0[8] = buf1[8]; - buf0[9] = buf1[9]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - buf0[12] = buf1[12]; - buf0[13] = buf1[13]; - btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - - // stage 9 - stage_idx++; - bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; - buf1[0] = buf0[0]; - buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]); - buf1[2] = buf0[12]; - buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]); - buf1[4] = buf0[6]; - buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]); - buf1[6] = buf0[10]; - buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); - buf1[8] = buf0[3]; - buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]); - buf1[10] = buf0[15]; - buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]); - buf1[12] = buf0[5]; - buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]); - buf1[14] = buf0[9]; - buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); - - output[0 * col_num + col] = buf1[0]; - output[1 * col_num + col] = buf1[1]; - output[2 * col_num + col] = buf1[2]; - output[3 * col_num + col] = buf1[3]; - output[4 * col_num + col] = buf1[4]; - output[5 * col_num + col] = buf1[5]; - output[6 * col_num + col] = buf1[6]; - output[7 * col_num + col] = buf1[7]; - output[8 * col_num + col] = buf1[8]; - output[9 * col_num + col] = buf1[9]; - output[10 * col_num + col] = buf1[10]; - output[11 * col_num + col] = buf1[11]; - output[12 * col_num + col] = buf1[12]; - output[13 * col_num + col] = buf1[13]; - output[14 * col_num + col] = buf1[14]; - output[15 * col_num + col] = buf1[15]; - } -} - void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, const int8_t *cos_bit, const int8_t *stage_range) { const int txfm_size = 32;
diff --git a/av1/common/x86/av1_fwd_txfm2d_sse4.c b/av1/common/x86/av1_fwd_txfm2d_sse4.c index 07c283e..3d60b36 100644 --- a/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ b/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -28,13 +28,7 @@ static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { - case TXFM_TYPE_DCT4: return av1_fdct4_new_sse4_1; break; - case TXFM_TYPE_DCT8: return av1_fdct8_new_sse4_1; break; - case TXFM_TYPE_DCT16: return av1_fdct16_new_sse4_1; break; case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break; - case TXFM_TYPE_ADST4: return av1_fadst4_new_sse4_1; break; - case TXFM_TYPE_ADST8: return av1_fadst8_new_sse4_1; break; - case TXFM_TYPE_ADST16: return av1_fadst16_new_sse4_1; break; case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break; default: assert(0); }
diff --git a/av1/common/x86/av1_fwd_txfm_impl_sse2.h b/av1/common/x86/av1_fwd_txfm_impl_sse2.h deleted file mode 100644 index 0e341ac..0000000 --- a/av1/common/x86/av1_fwd_txfm_impl_sse2.h +++ /dev/null
@@ -1,1014 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_ports/mem.h" - -// TODO(jingning) The high bit-depth functions need rework for performance. -// After we properly fix the high bit-depth function implementations, this -// file's dependency should be substantially simplified. -#if DCT_HIGH_BIT_DEPTH -#define ADD_EPI16 _mm_adds_epi16 -#define SUB_EPI16 _mm_subs_epi16 - -#else -#define ADD_EPI16 _mm_add_epi16 -#define SUB_EPI16 _mm_sub_epi16 -#endif - -void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { - // This 2D transform implements 4 vertical 1D transforms followed - // by 4 horizontal 1D transforms. The multiplies and adds are as given - // by Chen, Smith and Fralick ('77). The commands for moving the data - // around have been minimized by hand. - // For the purposes of the comments, the 16 inputs are referred to at i0 - // through iF (in raster order), intermediate variables are a0, b0, c0 - // through f, and correspond to the in-place computations mapped to input - // locations. The outputs, o0 through oF are labeled according to the - // output locations. - - // Constants - // These are the coefficients used for the multiplies. - // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), - // where cospi_N_64 = cos(N pi /64) - const __m128i k__cospi_A = - octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, - cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); - const __m128i k__cospi_B = - octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, - cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); - const __m128i k__cospi_C = - octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, - cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); - const __m128i k__cospi_D = - octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, - cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); - const __m128i k__cospi_E = - octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); - const __m128i k__cospi_F = - octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); - const __m128i k__cospi_G = - octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); - const __m128i k__cospi_H = - octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, - -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); - - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // This second rounding constant saves doing some extra adds at the end - const __m128i k__DCT_CONST_ROUNDING2 = - _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); - const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i in0, in1; -#if DCT_HIGH_BIT_DEPTH - __m128i cmp0, cmp1; - int test, overflow; -#endif - - // Load inputs. - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64( - in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); - in0 = _mm_unpacklo_epi64( - in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); -// in0 = [i0 i1 i2 i3 iC iD iE iF] -// in1 = [i4 i5 i6 i7 i8 i9 iA iB] -#if DCT_HIGH_BIT_DEPTH - // Check inputs small enough to use optimised code - cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), - _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00))); - cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), - _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00))); - test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); - if (test) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - - // multiply by 16 to give some extra precision - in0 = _mm_slli_epi16(in0, 4); - in1 = _mm_slli_epi16(in1, 4); - // if (i == 0 && input[0]) input[0] += 1; - // add 1 to the upper left pixel if it is non-zero, which helps reduce - // the round-trip error - { - // The mask will only contain whether the first value is zero, all - // other comparison will fail as something shifted by 4 (above << 4) - // can never be equal to one. To increment in the non-zero case, we - // add the mask and one for the first element: - // - if zero, mask = -1, v = v - 1 + 1 = v - // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 - __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); - in0 = _mm_add_epi16(in0, mask); - in0 = _mm_add_epi16(in0, k__nonzero_bias_b); - } - // There are 4 total stages, alternating between an add/subtract stage - // followed by an multiply-and-add stage. - { - // Stage 1: Add/subtract - - // in0 = [i0 i1 i2 i3 iC iD iE iF] - // in1 = [i4 i5 i6 i7 i8 i9 iA iB] - const __m128i r0 = _mm_unpacklo_epi16(in0, in1); - const __m128i r1 = _mm_unpackhi_epi16(in0, in1); - // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] - // r1 = [iC i8 iD i9 iE iA iF iB] - const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); - const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); - // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] - // r3 = [iC i8 iD i9 iF iB iE iA] - - const __m128i t0 = _mm_add_epi16(r2, r3); - const __m128i t1 = _mm_sub_epi16(r2, r3); - // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] - // t1 = [aC a8 aD a9 aF aB aE aA] - - // Stage 2: multiply by constants (which gets us into 32 bits). - // The constants needed here are: - // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] - // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] - // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] - // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); - // Then add and right-shift to get back to 16-bit range - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // w0 = [b0 b1 b7 b6] - // w1 = [b8 b9 bF bE] - // w2 = [b4 b5 b3 b2] - // w3 = [bC bD bB bA] - const __m128i x0 = _mm_packs_epi32(w0, w1); - const __m128i x1 = _mm_packs_epi32(w2, w3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&x0, &x1); - if (overflow) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // x0 = [b0 b1 b7 b6 b8 b9 bF bE] - // x1 = [b4 b5 b3 b2 bC bD bB bA] - in0 = _mm_shuffle_epi32(x0, 0xD8); - in1 = _mm_shuffle_epi32(x1, 0x8D); - // in0 = [b0 b1 b8 b9 b7 b6 bF bE] - // in1 = [b3 b2 bB bA b4 b5 bC bD] - } - { - // vertical DCTs finished. Now we do the horizontal DCTs. - // Stage 3: Add/subtract - - // t0 = [c0 c1 c8 c9 c4 c5 cC cD] - // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] - const __m128i t0 = ADD_EPI16(in0, in1); - const __m128i t1 = SUB_EPI16(in0, in1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&t0, &t1); - if (overflow) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - - // Stage 4: multiply by constants (which gets us into 32 bits). - { - // The constants needed here are: - // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] - // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] - // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] - // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); - const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); - const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); - // Then add and right-shift to get back to 16-bit range - // but this combines the final right-shift as well to save operations - // This unusual rounding operations is to maintain bit-accurate - // compatibility with the c version of this function which has two - // rounding steps in a row. - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); - // w0 = [o0 o4 o8 oC] - // w1 = [o2 o6 oA oE] - // w2 = [o1 o5 o9 oD] - // w3 = [o3 o7 oB oF] - // remember the o's are numbered according to the correct output location - const __m128i x0 = _mm_packs_epi32(w0, w1); - const __m128i x1 = _mm_packs_epi32(w2, w3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&x0, &x1); - if (overflow) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // x0 = [o0 o4 o8 oC o2 o6 oA oE] - // x1 = [o1 o5 o9 oD o3 o7 oB oF] - const __m128i y0 = _mm_unpacklo_epi16(x0, x1); - const __m128i y1 = _mm_unpackhi_epi16(x0, x1); - // y0 = [o0 o1 o4 o5 o8 o9 oC oD] - // y1 = [o2 o3 o6 o7 oA oB oE oF] - in0 = _mm_unpacklo_epi32(y0, y1); - // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] - in1 = _mm_unpackhi_epi32(y0, y1); - // in1 = [o8 o9 oA oB oC oD oE oF] - } - } - } - // Post-condition (v + 1) >> 2 is now incorporated into previous - // add and right-shift commands. Only 2 store instructions needed - // because we are using the fact that 1/3 are stored just after 0/2. - storeu_output(&in0, output + 0 * 4); - storeu_output(&in1, output + 2 * 4); -} - -void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. - __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = ADD_EPI16(in0, in7); - const __m128i q1 = ADD_EPI16(in1, in6); - const __m128i q2 = ADD_EPI16(in2, in5); - const __m128i q3 = ADD_EPI16(in3, in4); - const __m128i q4 = SUB_EPI16(in3, in4); - const __m128i q5 = SUB_EPI16(in2, in5); - const __m128i q6 = SUB_EPI16(in1, in6); - const __m128i q7 = SUB_EPI16(in0, in7); -#if DCT_HIGH_BIT_DEPTH - if (pass == 1) { - overflow = - check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } - } -#endif // DCT_HIGH_BIT_DEPTH - // Work on first four results - { - // Add/subtract - const __m128i r0 = ADD_EPI16(q0, q3); - const __m128i r1 = ADD_EPI16(q1, q2); - const __m128i r2 = SUB_EPI16(q1, q2); - const __m128i r3 = SUB_EPI16(q0, q3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us into 32bits - { - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&r0, &r1); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // Add/subtract - const __m128i x0 = ADD_EPI16(q4, r0); - const __m128i x1 = SUB_EPI16(q4, r0); - const __m128i x2 = SUB_EPI16(q7, r1); - const __m128i x3 = ADD_EPI16(q7, r1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us into 32bits - { - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - // store results - store_output(&in0, (output + 0 * 8)); - store_output(&in1, (output + 1 * 8)); - store_output(&in2, (output + 2 * 8)); - store_output(&in3, (output + 3 * 8)); - store_output(&in4, (output + 4 * 8)); - store_output(&in5, (output + 5 * 8)); - store_output(&in6, (output + 6 * 8)); - store_output(&in7, (output + 7 * 8)); - } -} - -void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[256]); - const int16_t *in = input; - int16_t *out0 = intermediate; - tran_low_t *out1 = output; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - for (column_start = 0; column_start < 16; column_start += 8) { - __m128i in00, in01, in02, in03, in04, in05, in06, in07; - __m128i in08, in09, in10, in11, in12, in13, in14, in15; - __m128i input0, input1, input2, input3, input4, input5, input6, input7; - __m128i step1_0, step1_1, step1_2, step1_3; - __m128i step1_4, step1_5, step1_6, step1_7; - __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - __m128i step3_0, step3_1, step3_2, step3_3; - __m128i step3_4, step3_5, step3_6, step3_7; - __m128i res00, res01, res02, res03, res04, res05, res06, res07; - __m128i res08, res09, res10, res11, res12, res13, res14, res15; - // Load and pre-condition input. - if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); - // x = x << 2 - in00 = _mm_slli_epi16(in00, 2); - in01 = _mm_slli_epi16(in01, 2); - in02 = _mm_slli_epi16(in02, 2); - in03 = _mm_slli_epi16(in03, 2); - in04 = _mm_slli_epi16(in04, 2); - in05 = _mm_slli_epi16(in05, 2); - in06 = _mm_slli_epi16(in06, 2); - in07 = _mm_slli_epi16(in07, 2); - in08 = _mm_slli_epi16(in08, 2); - in09 = _mm_slli_epi16(in09, 2); - in10 = _mm_slli_epi16(in10, 2); - in11 = _mm_slli_epi16(in11, 2); - in12 = _mm_slli_epi16(in12, 2); - in13 = _mm_slli_epi16(in13, 2); - in14 = _mm_slli_epi16(in14, 2); - in15 = _mm_slli_epi16(in15, 2); - } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); - // x = (x + 1) >> 2 - in00 = _mm_add_epi16(in00, kOne); - in01 = _mm_add_epi16(in01, kOne); - in02 = _mm_add_epi16(in02, kOne); - in03 = _mm_add_epi16(in03, kOne); - in04 = _mm_add_epi16(in04, kOne); - in05 = _mm_add_epi16(in05, kOne); - in06 = _mm_add_epi16(in06, kOne); - in07 = _mm_add_epi16(in07, kOne); - in08 = _mm_add_epi16(in08, kOne); - in09 = _mm_add_epi16(in09, kOne); - in10 = _mm_add_epi16(in10, kOne); - in11 = _mm_add_epi16(in11, kOne); - in12 = _mm_add_epi16(in12, kOne); - in13 = _mm_add_epi16(in13, kOne); - in14 = _mm_add_epi16(in14, kOne); - in15 = _mm_add_epi16(in15, kOne); - in00 = _mm_srai_epi16(in00, 2); - in01 = _mm_srai_epi16(in01, 2); - in02 = _mm_srai_epi16(in02, 2); - in03 = _mm_srai_epi16(in03, 2); - in04 = _mm_srai_epi16(in04, 2); - in05 = _mm_srai_epi16(in05, 2); - in06 = _mm_srai_epi16(in06, 2); - in07 = _mm_srai_epi16(in07, 2); - in08 = _mm_srai_epi16(in08, 2); - in09 = _mm_srai_epi16(in09, 2); - in10 = _mm_srai_epi16(in10, 2); - in11 = _mm_srai_epi16(in11, 2); - in12 = _mm_srai_epi16(in12, 2); - in13 = _mm_srai_epi16(in13, 2); - in14 = _mm_srai_epi16(in14, 2); - in15 = _mm_srai_epi16(in15, 2); - } - in += 8; - // Calculate input for the first 8 results. - { - input0 = ADD_EPI16(in00, in15); - input1 = ADD_EPI16(in01, in14); - input2 = ADD_EPI16(in02, in13); - input3 = ADD_EPI16(in03, in12); - input4 = ADD_EPI16(in04, in11); - input5 = ADD_EPI16(in05, in10); - input6 = ADD_EPI16(in06, in09); - input7 = ADD_EPI16(in07, in08); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, - &input4, &input5, &input6, &input7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Calculate input for the next 8 results. - { - step1_0 = SUB_EPI16(in07, in08); - step1_1 = SUB_EPI16(in06, in09); - step1_2 = SUB_EPI16(in05, in10); - step1_3 = SUB_EPI16(in04, in11); - step1_4 = SUB_EPI16(in03, in12); - step1_5 = SUB_EPI16(in02, in13); - step1_6 = SUB_EPI16(in01, in14); - step1_7 = SUB_EPI16(in00, in15); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, - &step1_4, &step1_5, &step1_6, &step1_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Work on the first eight values; fdct8(input, even_results); - { - // Add/subtract - const __m128i q0 = ADD_EPI16(input0, input7); - const __m128i q1 = ADD_EPI16(input1, input6); - const __m128i q2 = ADD_EPI16(input2, input5); - const __m128i q3 = ADD_EPI16(input3, input4); - const __m128i q4 = SUB_EPI16(input3, input4); - const __m128i q5 = SUB_EPI16(input2, input5); - const __m128i q6 = SUB_EPI16(input1, input6); - const __m128i q7 = SUB_EPI16(input0, input7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Work on first four results - { - // Add/subtract - const __m128i r0 = ADD_EPI16(q0, q3); - const __m128i r1 = ADD_EPI16(q1, q2); - const __m128i r2 = SUB_EPI16(q1, q2); - const __m128i r3 = SUB_EPI16(q0, q3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us - // into 32 bits. - { - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i r0 = - mult_round_shift(&d0, &d1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - const __m128i r1 = - mult_round_shift(&d0, &d1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&r0, &r1); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // Add/subtract - const __m128i x0 = ADD_EPI16(q4, r0); - const __m128i x1 = SUB_EPI16(q4, r0); - const __m128i x2 = SUB_EPI16(q7, r1); - const __m128i x3 = ADD_EPI16(q7, r1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us - // into 32 bits. - { - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&res02, &res14, &res10, &res06); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - } - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 3 - { - step3_0 = ADD_EPI16(step1_0, step2_3); - step3_1 = ADD_EPI16(step1_1, step2_2); - step3_2 = SUB_EPI16(step1_1, step2_2); - step3_3 = SUB_EPI16(step1_0, step2_3); - step3_4 = SUB_EPI16(step1_7, step2_4); - step3_5 = SUB_EPI16(step1_6, step2_5); - step3_6 = ADD_EPI16(step1_6, step2_5); - step3_7 = ADD_EPI16(step1_7, step2_4); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, - &step3_4, &step3_5, &step3_6, &step3_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 4 - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); - step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 5 - { - step1_0 = ADD_EPI16(step3_0, step2_1); - step1_1 = SUB_EPI16(step3_0, step2_1); - step1_2 = ADD_EPI16(step3_3, step2_2); - step1_3 = SUB_EPI16(step3_3, step2_2); - step1_4 = SUB_EPI16(step3_4, step2_5); - step1_5 = ADD_EPI16(step3_4, step2_5); - step1_6 = SUB_EPI16(step3_7, step2_6); - step1_7 = ADD_EPI16(step3_7, step2_6); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, - &step1_4, &step1_5, &step1_6, &step1_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 6 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Transpose the results, do it as two 8x8 transposes. - transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, - &res06, &res07, pass, out0, out1); - transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, - &res14, &res15, pass, out0 + 8, out1 + 8); - if (pass == 0) { - out0 += 8 * 16; - } else { - out1 += 8 * 16; - } - } - // Setup in/out for next pass. - in = intermediate; - } -} - -#undef ADD_EPI16 -#undef SUB_EPI16
diff --git a/av1/common/x86/av1_fwd_txfm_sse2.c b/av1/common/x86/av1_fwd_txfm_sse2.c deleted file mode 100644 index 081fe08..0000000 --- a/av1/common/x86/av1_fwd_txfm_sse2.c +++ /dev/null
@@ -1,272 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "./aom_config.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" - -void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0, in1; - __m128i tmp; - const __m128i zero = _mm_setzero_si128(); - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64( - in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); - in0 = _mm_unpacklo_epi64( - in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); - - tmp = _mm_add_epi16(in0, in1); - in0 = _mm_unpacklo_epi16(zero, tmp); - in1 = _mm_unpackhi_epi16(zero, tmp); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(tmp, zero); - in1 = _mm_unpackhi_epi32(tmp, zero); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(tmp, 8); - - in1 = _mm_add_epi32(tmp, in0); - in0 = _mm_slli_epi32(in1, 1); - store_output(&in0, output); -} - -void av1_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i u0, u1, sum; - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(u0, u1); - - in0 = _mm_add_epi16(in0, in1); - in2 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, in0); - - u0 = _mm_setzero_si128(); - sum = _mm_add_epi16(sum, in2); - - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - store_output(&in1, output); -} - -void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, - int stride) { - __m128i in0, in1, in2, in3; - __m128i u0, u1; - __m128i sum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 2; ++i) { - input += 8 * i; - in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - sum = _mm_add_epi16(sum, u1); - } - - u0 = _mm_setzero_si128(); - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - in1 = _mm_srai_epi32(in1, 1); - store_output(&in1, output); -} - -void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, - int stride) { - __m128i in0, in1, in2, in3; - __m128i u0, u1; - __m128i sum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 8; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - sum = _mm_add_epi16(sum, u1); - } - - u0 = _mm_setzero_si128(); - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - in1 = _mm_srai_epi32(in1, 3); - store_output(&in1, output); -} - -#define DCT_HIGH_BIT_DEPTH 0 -#define FDCT4x4_2D av1_fdct4x4_sse2 -#define FDCT8x8_2D av1_fdct8x8_sse2 -#define FDCT16x16_2D av1_fdct16x16_sse2 -#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h" -#undef FDCT4x4_2D -#undef FDCT8x8_2D -#undef FDCT16x16_2D - -#define FDCT32x32_2D av1_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D av1_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH - -#if CONFIG_AOM_HIGHBITDEPTH -#define DCT_HIGH_BIT_DEPTH 1 -#define FDCT4x4_2D av1_highbd_fdct4x4_sse2 -#define FDCT8x8_2D av1_highbd_fdct8x8_sse2 -#define FDCT16x16_2D av1_highbd_fdct16x16_sse2 -#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h" // NOLINT -#undef FDCT4x4_2D -#undef FDCT8x8_2D -#undef FDCT16x16_2D - -#define FDCT32x32_2D av1_highbd_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D av1_highbd_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH -#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c deleted file mode 100644 index 365c124..0000000 --- a/av1/common/x86/av1_inv_txfm_sse2.c +++ /dev/null
@@ -1,4028 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./av1_rtcd.h" -#include "av1/common/x86/av1_inv_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - -void av1_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = _mm_load_si128((const __m128i *)input); - input2 = _mm_load_si128((const __m128i *)(input + 8)); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void av1_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -void av1_idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - - transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -void av1_iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void av1_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from av1_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D av1_idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void av1_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -void av1_idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from av1_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D av1_idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); -} - -void av1_iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); -} - -void av1_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. - in0 = _mm_load_si128((const __m128i *)input); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -void av1_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = l; - for (i = 0; i < 2; i++) { - // 1-D av1_idct - - // Load input data. - in[0] = _mm_load_si128((const __m128i *)input); - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); - in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); - in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); - in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); - in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); - in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); - in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); - in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); - in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D av1_idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void av1_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 2; ++i) { - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); - RECON_AND_STORE(dest + 8 * stride, dc_value); - RECON_AND_STORE(dest + 9 * stride, dc_value); - RECON_AND_STORE(dest + 10 * stride, dc_value); - RECON_AND_STORE(dest + 11 * stride, dc_value); - RECON_AND_STORE(dest + 12 * stride, dc_value); - RECON_AND_STORE(dest + 13 * stride, dc_value); - RECON_AND_STORE(dest + 14 * stride, dc_value); - RECON_AND_STORE(dest + 15 * stride, dc_value); - dest += 8; - } -} - -static void av1_iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void av1_idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - -void av1_idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - av1_idct16_8col(in0); - av1_idct16_8col(in1); -} - -void av1_iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - av1_iadst16_8col(in0); - av1_iadst16_8col(in1); -} - -void av1_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = _mm_load_si128((const __m128i *)input); - in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = _mm_load_si128((const __m128i *)input); \ - input += 8; \ - } - -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } - -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } - -// Only upper-left 8x8 has non-zero coeff -void av1_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // av1_idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = _mm_load_si128((const __m128i *)input); - in[1] = _mm_load_si128((const __m128i *)(input + 32)); - in[2] = _mm_load_si128((const __m128i *)(input + 64)); - in[3] = _mm_load_si128((const __m128i *)(input + 96)); - in[4] = _mm_load_si128((const __m128i *)(input + 128)); - in[5] = _mm_load_si128((const __m128i *)(input + 160)); - in[6] = _mm_load_si128((const __m128i *)(input + 192)); - in[7] = _mm_load_si128((const __m128i *)(input + 224)); - - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - - array_transpose_8x8(in, in); - // TODO(hkuang): Following transposes are unnecessary. But remove them will - // lead to performance drop on some devices. - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void av1_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // av1_idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D av1_idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D av1_idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void av1_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 4; ++i) { - int j; - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + j * stride, dc_value); - } - dest += 8; - } -} - -#if CONFIG_AOM_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void av1_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - av1_idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - av1_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - av1_idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - av1_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void av1_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - av1_idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - av1_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - av1_idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - av1_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void av1_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - av1_idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - av1_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - av1_idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - av1_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void av1_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - av1_idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - av1_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - av1_idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - av1_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void av1_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - av1_idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - av1_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - av1_idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - av1_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} -#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.h b/av1/common/x86/av1_inv_txfm_sse2.h deleted file mode 100644 index a8bb6c1..0000000 --- a/av1/common/x86/av1_inv_txfm_sse2.h +++ /dev/null
@@ -1,178 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_ -#define AOM_DSP_X86_INV_TXFM_SSE2_H_ - -#include <emmintrin.h> // SSE2 -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "av1/common/av1_inv_txfm.h" - -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} - -#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/test/av1_inv_txfm_test.cc b/test/av1_inv_txfm_test.cc index 84e2402..8f6c868 100644 --- a/test/av1_inv_txfm_test.cc +++ b/test/av1_inv_txfm_test.cc
@@ -24,7 +24,7 @@ #include "av1/common/blockd.h" #include "av1/common/scan.h" #include "aom/aom_integer.h" -#include "av1/common/av1_inv_txfm.h" +#include "aom_dsp/inv_txfm.h" using libaom_test::ACMRandom; @@ -104,10 +104,10 @@ INSTANTIATE_TEST_CASE_P( C, AV1InvTxfm, - ::testing::Values(IdctParam(&av1_idct4_c, &reference_idct_1d, 4, 1), - IdctParam(&av1_idct8_c, &reference_idct_1d, 8, 2), - IdctParam(&av1_idct16_c, &reference_idct_1d, 16, 4), - IdctParam(&av1_idct32_c, &reference_idct_1d, 32, 6))); + ::testing::Values(IdctParam(&aom_idct4_c, &reference_idct_1d, 4, 1), + IdctParam(&aom_idct8_c, &reference_idct_1d, 8, 2), + IdctParam(&aom_idct16_c, &reference_idct_1d, 16, 4), + IdctParam(&aom_idct32_c, &reference_idct_1d, 32, 6))); #if CONFIG_AV1_ENCODER typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); @@ -262,19 +262,19 @@ INSTANTIATE_TEST_CASE_P( C, AV1PartialIDctTest, - ::testing::Values(make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c, - &av1_idct32x32_34_add_c, TX_32X32, 34), - make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c, - &av1_idct32x32_1_add_c, TX_32X32, 1), - make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c, - &av1_idct16x16_10_add_c, TX_16X16, 10), - make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c, - &av1_idct16x16_1_add_c, TX_16X16, 1), - make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c, - &av1_idct8x8_12_add_c, TX_8X8, 12), - make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c, - &av1_idct8x8_1_add_c, TX_8X8, 1), - make_tuple(&av1_fdct4x4_c, &av1_idct4x4_16_add_c, - &av1_idct4x4_1_add_c, TX_4X4, 1))); + ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, + &aom_idct32x32_34_add_c, TX_32X32, 34), + make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, + &aom_idct32x32_1_add_c, TX_32X32, 1), + make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c, + &aom_idct16x16_10_add_c, TX_16X16, 10), + make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c, + &aom_idct16x16_1_add_c, TX_16X16, 1), + make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c, + &aom_idct8x8_12_add_c, TX_8X8, 12), + make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c, + &aom_idct8x8_1_add_c, TX_8X8, 1), + make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c, + &aom_idct4x4_1_add_c, TX_4X4, 1))); #endif // CONFIG_AV1_ENCODER } // namespace