Merge "Make rectangular transform block available in the common lib" into nextgenv2
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index 28ea49c..9384ffe 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -33,7 +33,7 @@
typedef int8_t aom_tree_index;
-#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2)
+#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
#define aom_complement(x) (255 - x)
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 7dc17f0..39e9b8e 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -14,6 +14,8 @@
#include <immintrin.h>
+#include "aom_dsp/txfm_common.h"
+
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -24,4 +26,179 @@
_mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
(int)(b), (int)(a))
+static INLINE void mm256_reverse_epi16(__m256i *u) {
+ const __m256i control = _mm256_set_epi16(
+ 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+ 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+ __m256i v = _mm256_shuffle_epi8(*u, control);
+ *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+ // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+ // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+ // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+ // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i y0 = _mm256_madd_epi16(a0, cospi);
+ __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+ y0 = _mm256_add_epi32(y0, dct_rounding);
+ y1 = _mm256_add_epi32(y1, dct_rounding);
+ y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+ return _mm256_packs_epi32(y0, y1);
+}
+
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+ const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i u0, u1;
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 1);
+
+ u0 = _mm256_unpacklo_epi16(zero, in[i]);
+ u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+ u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+ u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+ u0 = _mm256_add_epi32(u0, dct_const_rounding);
+ u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ in[i] = _mm256_packs_epi32(u0, u1);
+ i++;
+ }
+}
+
#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 43b76ad..0fe4a89 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -30,8 +30,6 @@
AV1_COMMON_SRCS-yes += common/filter.c
AV1_COMMON_SRCS-yes += common/idct.h
AV1_COMMON_SRCS-yes += common/idct.c
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.h
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.c
AV1_COMMON_SRCS-yes += common/loopfilter.h
AV1_COMMON_SRCS-yes += common/thread_common.h
AV1_COMMON_SRCS-yes += common/mv.h
@@ -61,8 +59,6 @@
AV1_COMMON_SRCS-yes += common/scan.c
AV1_COMMON_SRCS-yes += common/scan.h
# TODO(angiebird) the forward transform belongs under encoder/
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.h
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c
AV1_COMMON_SRCS-yes += common/av1_txfm.h
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
@@ -122,10 +118,9 @@
AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
ifeq ($(CONFIG_AV1_ENCODER),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_impl_sse2.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c
@@ -143,7 +138,4 @@
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
endif
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.h
-
$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/common/av1_fwd_txfm.c b/av1/common/av1_fwd_txfm.c
deleted file mode 100644
index 84a3876..0000000
--- a/av1/common/av1_fwd_txfm.c
+++ /dev/null
@@ -1,813 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/common/av1_fwd_txfm.h"
-#include <assert.h>
-#include "./av1_rtcd.h"
-
-void av1_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[4 * 4];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t in_high[4]; // canbe16
- tran_high_t step[4]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 4; ++i) {
- // Load inputs.
- if (0 == pass) {
- in_high[0] = input[0 * stride] * 16;
- in_high[1] = input[1 * stride] * 16;
- in_high[2] = input[2 * stride] * 16;
- in_high[3] = input[3 * stride] * 16;
- if (i == 0 && in_high[0]) {
- in_high[0] += 1;
- }
- } else {
- assert(in_low != NULL);
- in_high[0] = in_low[0 * 4];
- in_high[1] = in_low[1 * 4];
- in_high[2] = in_low[2 * 4];
- in_high[3] = in_low[3 * 4];
- in_low++;
- }
- // Transform.
- step[0] = in_high[0] + in_high[3];
- step[1] = in_high[1] + in_high[2];
- step[2] = in_high[1] - in_high[2];
- step[3] = in_high[0] - in_high[3];
- temp1 = (step[0] + step[1]) * cospi_16_64;
- temp2 = (step[0] - step[1]) * cospi_16_64;
- out[0] = (tran_low_t)fdct_round_shift(temp1);
- out[2] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
- temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[3] = (tran_low_t)fdct_round_shift(temp2);
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 4;
- }
- // Setup in_low/out for next pass.
- in_low = intermediate;
- out = output;
- }
-
- {
- int i, j;
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
- }
- }
-}
-
-void av1_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 4; ++r)
- for (c = 0; c < 4; ++c) sum += input[r * stride + c];
-
- output[0] = sum << 1;
- output[1] = 0;
-}
-
-void av1_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
- int i, j;
- tran_low_t intermediate[64];
- int pass;
- tran_low_t *output = intermediate;
- const tran_low_t *in = NULL;
-
- // Transform columns
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- for (i = 0; i < 8; i++) {
- // stage 1
- if (pass == 0) {
- s0 = (input[0 * stride] + input[7 * stride]) * 4;
- s1 = (input[1 * stride] + input[6 * stride]) * 4;
- s2 = (input[2 * stride] + input[5 * stride]) * 4;
- s3 = (input[3 * stride] + input[4 * stride]) * 4;
- s4 = (input[3 * stride] - input[4 * stride]) * 4;
- s5 = (input[2 * stride] - input[5 * stride]) * 4;
- s6 = (input[1 * stride] - input[6 * stride]) * 4;
- s7 = (input[0 * stride] - input[7 * stride]) * 4;
- ++input;
- } else {
- s0 = in[0 * 8] + in[7 * 8];
- s1 = in[1 * 8] + in[6 * 8];
- s2 = in[2 * 8] + in[5 * 8];
- s3 = in[3 * 8] + in[4 * 8];
- s4 = in[3 * 8] - in[4 * 8];
- s5 = in[2 * 8] - in[5 * 8];
- s6 = in[1 * 8] - in[6 * 8];
- s7 = in[0 * 8] - in[7 * 8];
- ++in;
- }
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
- t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = (tran_low_t)fdct_round_shift(t0);
- output[2] = (tran_low_t)fdct_round_shift(t2);
- output[4] = (tran_low_t)fdct_round_shift(t1);
- output[6] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = (tran_low_t)fdct_round_shift(t0);
- output[3] = (tran_low_t)fdct_round_shift(t2);
- output[5] = (tran_low_t)fdct_round_shift(t1);
- output[7] = (tran_low_t)fdct_round_shift(t3);
- output += 8;
- }
- in = intermediate;
- output = final_output;
- }
-
- // Rows
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
- }
-}
-
-void av1_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 8; ++r)
- for (c = 0; c < 8; ++c) sum += input[r * stride + c];
-
- output[0] = sum;
- output[1] = 0;
-}
-
-void av1_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[256];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t step1[8]; // canbe16
- tran_high_t step2[8]; // canbe16
- tran_high_t step3[8]; // canbe16
- tran_high_t in_high[8]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 16; i++) {
- if (0 == pass) {
- // Calculate input for the first 8 results.
- in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
- in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
- in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
- in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
- in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
- in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
- in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
- in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
- // Calculate input for the next 8 results.
- step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
- step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
- step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
- step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
- step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
- step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
- step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
- step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
- } else {
- // Calculate input for the first 8 results.
- assert(in_low != NULL);
- in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
- in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
- in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
- in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
- in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
- in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
- in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
- in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
- // Calculate input for the next 8 results.
- step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
- step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
- step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
- step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
- step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
- step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
- step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
- step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
- in_low++;
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- // stage 1
- s0 = in_high[0] + in_high[7];
- s1 = in_high[1] + in_high[6];
- s2 = in_high[2] + in_high[5];
- s3 = in_high[3] + in_high[4];
- s4 = in_high[3] - in_high[4];
- s5 = in_high[2] - in_high[5];
- s6 = in_high[1] - in_high[6];
- s7 = in_high[0] - in_high[7];
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
- t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = (tran_low_t)fdct_round_shift(t0);
- out[4] = (tran_low_t)fdct_round_shift(t2);
- out[8] = (tran_low_t)fdct_round_shift(t1);
- out[12] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = (tran_low_t)fdct_round_shift(t0);
- out[6] = (tran_low_t)fdct_round_shift(t2);
- out[10] = (tran_low_t)fdct_round_shift(t1);
- out[14] = (tran_low_t)fdct_round_shift(t3);
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- temp1 = (step1[5] - step1[2]) * cospi_16_64;
- temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = fdct_round_shift(temp1);
- step2[3] = fdct_round_shift(temp2);
- temp1 = (step1[4] + step1[3]) * cospi_16_64;
- temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = fdct_round_shift(temp1);
- step2[5] = fdct_round_shift(temp2);
- // step 3
- step3[0] = step1[0] + step2[3];
- step3[1] = step1[1] + step2[2];
- step3[2] = step1[1] - step2[2];
- step3[3] = step1[0] - step2[3];
- step3[4] = step1[7] - step2[4];
- step3[5] = step1[6] - step2[5];
- step3[6] = step1[6] + step2[5];
- step3[7] = step1[7] + step2[4];
- // step 4
- temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
- step2[1] = fdct_round_shift(temp1);
- step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
- temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = fdct_round_shift(temp1);
- step2[6] = fdct_round_shift(temp2);
- // step 5
- step1[0] = step3[0] + step2[1];
- step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] + step2[2];
- step1[3] = step3[3] - step2[2];
- step1[4] = step3[4] - step2[5];
- step1[5] = step3[4] + step2[5];
- step1[6] = step3[7] - step2[6];
- step1[7] = step3[7] + step2[6];
- // step 6
- temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[9] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = (tran_low_t)fdct_round_shift(temp1);
- out[13] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = (tran_low_t)fdct_round_shift(temp1);
- out[11] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = (tran_low_t)fdct_round_shift(temp1);
- out[15] = (tran_low_t)fdct_round_shift(temp2);
- }
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 16;
- }
- // Setup in/out for next pass.
- in_low = intermediate;
- out = output;
- }
-}
-
-void av1_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 16; ++r)
- for (c = 0; c < 16; ++c) sum += input[r * stride + c];
-
- output[0] = sum >> 1;
- output[1] = 0;
-}
-
-static INLINE tran_high_t dct_32_round(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- // TODO(debargha, peter.derivaz): Find new bounds for this assert,
- // and make the bounds consts.
- // assert(-131072 <= rv && rv <= 131071);
- return rv;
-}
-
-static INLINE tran_high_t half_round_shift(tran_high_t input) {
- tran_high_t rv = (input + 1 + (input < 0)) >> 2;
- return rv;
-}
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
- tran_high_t step[32];
- // Stage 1
- step[0] = input[0] + input[(32 - 1)];
- step[1] = input[1] + input[(32 - 2)];
- step[2] = input[2] + input[(32 - 3)];
- step[3] = input[3] + input[(32 - 4)];
- step[4] = input[4] + input[(32 - 5)];
- step[5] = input[5] + input[(32 - 6)];
- step[6] = input[6] + input[(32 - 7)];
- step[7] = input[7] + input[(32 - 8)];
- step[8] = input[8] + input[(32 - 9)];
- step[9] = input[9] + input[(32 - 10)];
- step[10] = input[10] + input[(32 - 11)];
- step[11] = input[11] + input[(32 - 12)];
- step[12] = input[12] + input[(32 - 13)];
- step[13] = input[13] + input[(32 - 14)];
- step[14] = input[14] + input[(32 - 15)];
- step[15] = input[15] + input[(32 - 16)];
- step[16] = -input[16] + input[(32 - 17)];
- step[17] = -input[17] + input[(32 - 18)];
- step[18] = -input[18] + input[(32 - 19)];
- step[19] = -input[19] + input[(32 - 20)];
- step[20] = -input[20] + input[(32 - 21)];
- step[21] = -input[21] + input[(32 - 22)];
- step[22] = -input[22] + input[(32 - 23)];
- step[23] = -input[23] + input[(32 - 24)];
- step[24] = -input[24] + input[(32 - 25)];
- step[25] = -input[25] + input[(32 - 26)];
- step[26] = -input[26] + input[(32 - 27)];
- step[27] = -input[27] + input[(32 - 28)];
- step[28] = -input[28] + input[(32 - 29)];
- step[29] = -input[29] + input[(32 - 30)];
- step[30] = -input[30] + input[(32 - 31)];
- step[31] = -input[31] + input[(32 - 32)];
-
- // Stage 2
- output[0] = step[0] + step[16 - 1];
- output[1] = step[1] + step[16 - 2];
- output[2] = step[2] + step[16 - 3];
- output[3] = step[3] + step[16 - 4];
- output[4] = step[4] + step[16 - 5];
- output[5] = step[5] + step[16 - 6];
- output[6] = step[6] + step[16 - 7];
- output[7] = step[7] + step[16 - 8];
- output[8] = -step[8] + step[16 - 9];
- output[9] = -step[9] + step[16 - 10];
- output[10] = -step[10] + step[16 - 11];
- output[11] = -step[11] + step[16 - 12];
- output[12] = -step[12] + step[16 - 13];
- output[13] = -step[13] + step[16 - 14];
- output[14] = -step[14] + step[16 - 15];
- output[15] = -step[15] + step[16 - 16];
-
- output[16] = step[16];
- output[17] = step[17];
- output[18] = step[18];
- output[19] = step[19];
-
- output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
- output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
- output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
- output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
- output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
- output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
- output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
- output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
- output[28] = step[28];
- output[29] = step[29];
- output[30] = step[30];
- output[31] = step[31];
-
- // dump the magnitude by 4, hence the intermediate values are within
- // the range of 16 bits.
- if (round) {
- output[0] = half_round_shift(output[0]);
- output[1] = half_round_shift(output[1]);
- output[2] = half_round_shift(output[2]);
- output[3] = half_round_shift(output[3]);
- output[4] = half_round_shift(output[4]);
- output[5] = half_round_shift(output[5]);
- output[6] = half_round_shift(output[6]);
- output[7] = half_round_shift(output[7]);
- output[8] = half_round_shift(output[8]);
- output[9] = half_round_shift(output[9]);
- output[10] = half_round_shift(output[10]);
- output[11] = half_round_shift(output[11]);
- output[12] = half_round_shift(output[12]);
- output[13] = half_round_shift(output[13]);
- output[14] = half_round_shift(output[14]);
- output[15] = half_round_shift(output[15]);
-
- output[16] = half_round_shift(output[16]);
- output[17] = half_round_shift(output[17]);
- output[18] = half_round_shift(output[18]);
- output[19] = half_round_shift(output[19]);
- output[20] = half_round_shift(output[20]);
- output[21] = half_round_shift(output[21]);
- output[22] = half_round_shift(output[22]);
- output[23] = half_round_shift(output[23]);
- output[24] = half_round_shift(output[24]);
- output[25] = half_round_shift(output[25]);
- output[26] = half_round_shift(output[26]);
- output[27] = half_round_shift(output[27]);
- output[28] = half_round_shift(output[28]);
- output[29] = half_round_shift(output[29]);
- output[30] = half_round_shift(output[30]);
- output[31] = half_round_shift(output[31]);
- }
-
- // Stage 3
- step[0] = output[0] + output[(8 - 1)];
- step[1] = output[1] + output[(8 - 2)];
- step[2] = output[2] + output[(8 - 3)];
- step[3] = output[3] + output[(8 - 4)];
- step[4] = -output[4] + output[(8 - 5)];
- step[5] = -output[5] + output[(8 - 6)];
- step[6] = -output[6] + output[(8 - 7)];
- step[7] = -output[7] + output[(8 - 8)];
- step[8] = output[8];
- step[9] = output[9];
- step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
- step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
- step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
- step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
- step[14] = output[14];
- step[15] = output[15];
-
- step[16] = output[16] + output[23];
- step[17] = output[17] + output[22];
- step[18] = output[18] + output[21];
- step[19] = output[19] + output[20];
- step[20] = -output[20] + output[19];
- step[21] = -output[21] + output[18];
- step[22] = -output[22] + output[17];
- step[23] = -output[23] + output[16];
- step[24] = -output[24] + output[31];
- step[25] = -output[25] + output[30];
- step[26] = -output[26] + output[29];
- step[27] = -output[27] + output[28];
- step[28] = output[28] + output[27];
- step[29] = output[29] + output[26];
- step[30] = output[30] + output[25];
- step[31] = output[31] + output[24];
-
- // Stage 4
- output[0] = step[0] + step[3];
- output[1] = step[1] + step[2];
- output[2] = -step[2] + step[1];
- output[3] = -step[3] + step[0];
- output[4] = step[4];
- output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
- output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
- output[7] = step[7];
- output[8] = step[8] + step[11];
- output[9] = step[9] + step[10];
- output[10] = -step[10] + step[9];
- output[11] = -step[11] + step[8];
- output[12] = -step[12] + step[15];
- output[13] = -step[13] + step[14];
- output[14] = step[14] + step[13];
- output[15] = step[15] + step[12];
-
- output[16] = step[16];
- output[17] = step[17];
- output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
- output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
- output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
- output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
- output[22] = step[22];
- output[23] = step[23];
- output[24] = step[24];
- output[25] = step[25];
- output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
- output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
- output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
- output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
- output[30] = step[30];
- output[31] = step[31];
-
- // Stage 5
- step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
- step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
- step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
- step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
- step[4] = output[4] + output[5];
- step[5] = -output[5] + output[4];
- step[6] = -output[6] + output[7];
- step[7] = output[7] + output[6];
- step[8] = output[8];
- step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
- step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
- step[11] = output[11];
- step[12] = output[12];
- step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
- step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
- step[15] = output[15];
-
- step[16] = output[16] + output[19];
- step[17] = output[17] + output[18];
- step[18] = -output[18] + output[17];
- step[19] = -output[19] + output[16];
- step[20] = -output[20] + output[23];
- step[21] = -output[21] + output[22];
- step[22] = output[22] + output[21];
- step[23] = output[23] + output[20];
- step[24] = output[24] + output[27];
- step[25] = output[25] + output[26];
- step[26] = -output[26] + output[25];
- step[27] = -output[27] + output[24];
- step[28] = -output[28] + output[31];
- step[29] = -output[29] + output[30];
- step[30] = output[30] + output[29];
- step[31] = output[31] + output[28];
-
- // Stage 6
- output[0] = step[0];
- output[1] = step[1];
- output[2] = step[2];
- output[3] = step[3];
- output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
- output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
- output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
- output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
- output[8] = step[8] + step[9];
- output[9] = -step[9] + step[8];
- output[10] = -step[10] + step[11];
- output[11] = step[11] + step[10];
- output[12] = step[12] + step[13];
- output[13] = -step[13] + step[12];
- output[14] = -step[14] + step[15];
- output[15] = step[15] + step[14];
-
- output[16] = step[16];
- output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
- output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
- output[19] = step[19];
- output[20] = step[20];
- output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
- output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
- output[23] = step[23];
- output[24] = step[24];
- output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
- output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
- output[27] = step[27];
- output[28] = step[28];
- output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
- output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
- output[31] = step[31];
-
- // Stage 7
- step[0] = output[0];
- step[1] = output[1];
- step[2] = output[2];
- step[3] = output[3];
- step[4] = output[4];
- step[5] = output[5];
- step[6] = output[6];
- step[7] = output[7];
- step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
- step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
- step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
- step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
- step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
- step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
- step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
- step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
- step[16] = output[16] + output[17];
- step[17] = -output[17] + output[16];
- step[18] = -output[18] + output[19];
- step[19] = output[19] + output[18];
- step[20] = output[20] + output[21];
- step[21] = -output[21] + output[20];
- step[22] = -output[22] + output[23];
- step[23] = output[23] + output[22];
- step[24] = output[24] + output[25];
- step[25] = -output[25] + output[24];
- step[26] = -output[26] + output[27];
- step[27] = output[27] + output[26];
- step[28] = output[28] + output[29];
- step[29] = -output[29] + output[28];
- step[30] = -output[30] + output[31];
- step[31] = output[31] + output[30];
-
- // Final stage --- outputs indices are bit-reversed.
- output[0] = step[0];
- output[16] = step[1];
- output[8] = step[2];
- output[24] = step[3];
- output[4] = step[4];
- output[20] = step[5];
- output[12] = step[6];
- output[28] = step[7];
- output[2] = step[8];
- output[18] = step[9];
- output[10] = step[10];
- output[26] = step[11];
- output[6] = step[12];
- output[22] = step[13];
- output[14] = step[14];
- output[30] = step[15];
-
- output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
- output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
- output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
- output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
- output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
- output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
- output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
- output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
- output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
- output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
- output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
- output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
- output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
- output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
- output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
- output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
-}
-
-void av1_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
-}
-
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void av1_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- // TODO(cd): see quality impact of only doing
- // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
- // PS: also change code in av1_dsp/x86/av1_dct_sse2.c
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- av1_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-
-void av1_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 32; ++r)
- for (c = 0; c < 32; ++c) sum += input[r * stride + c];
-
- output[0] = sum >> 3;
- output[1] = 0;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct4x4_c(input, output, stride);
-}
-
-void av1_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- av1_fdct8x8_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- av1_fdct8x8_1_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct16x16_c(input, output, stride);
-}
-
-void av1_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct16x16_1_c(input, output, stride);
-}
-
-void av1_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- av1_fdct32x32_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
- int stride) {
- av1_fdct32x32_rd_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
- int stride) {
- av1_fdct32x32_1_c(input, out, stride);
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_fwd_txfm.h b/av1/common/av1_fwd_txfm.h
deleted file mode 100644
index db763e5..0000000
--- a/av1/common/av1_fwd_txfm.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_COMMON_AV1_FWD_TXFM_H_
-#define AV1_COMMON_AV1_FWD_TXFM_H_
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/fwd_txfm.h"
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif // AV1_COMMON_AV1_FWD_TXFM_H_
diff --git a/av1/common/av1_inv_txfm.c b/av1/common/av1_inv_txfm.c
deleted file mode 100644
index 4b2f061..0000000
--- a/av1/common/av1_inv_txfm.c
+++ /dev/null
@@ -1,2468 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_inv_txfm.h"
-
-void av1_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = WRAPLOW(a1);
- op[1] = WRAPLOW(b1);
- op[2] = WRAPLOW(c1);
- op[3] = WRAPLOW(d1);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
- dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
- dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
- dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
-
- ip++;
- dest++;
- }
-}
-
-void av1_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = WRAPLOW(a1);
- op[1] = op[2] = op[3] = WRAPLOW(e1);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
- dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
- dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
- dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
- ip++;
- dest++;
- }
-}
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1));
- step[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1));
- step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- output[0] = WRAPLOW(step[0] + step[3]);
- output[1] = WRAPLOW(step[1] + step[2]);
- output[2] = WRAPLOW(step[1] - step[2]);
- output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void av1_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
-
- // Rows
- for (i = 0; i < 4; ++i) {
- av1_idct4_c(input, outptr);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_idct4_c(temp_in, temp_out);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 4));
- }
- }
-}
-
-void av1_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- int i;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = clip_pixel_add(dest[0], a1);
- dest[1] = clip_pixel_add(dest[1], a1);
- dest[2] = clip_pixel_add(dest[2], a1);
- dest[3] = clip_pixel_add(dest[3], a1);
- dest += dest_stride;
- }
-}
-
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- temp1 = (step1[0] + step1[2]) * cospi_16_64;
- temp2 = (step1[0] - step1[2]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- // stage 3
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- // stage 4
- output[0] = WRAPLOW(step1[0] + step1[7]);
- output[1] = WRAPLOW(step1[1] + step1[6]);
- output[2] = WRAPLOW(step1[2] + step1[5]);
- output[3] = WRAPLOW(step1[3] + step1[4]);
- output[4] = WRAPLOW(step1[3] - step1[4]);
- output[5] = WRAPLOW(step1[2] - step1[5]);
- output[6] = WRAPLOW(step1[1] - step1[6]);
- output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void av1_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- for (i = 0; i < 8; ++i) {
- av1_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void av1_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
-
- if (!(x0 | x1 | x2 | x3)) {
- output[0] = output[1] = output[2] = output[3] = 0;
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = WRAPLOW(x0 - x2 + x3);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
- output[2] = WRAPLOW(dct_const_round_shift(s2));
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_high_t x0 = input[7];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
- tran_high_t x7 = input[6];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = 0;
- return;
- }
-
- // stage 1
- s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
- s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
- s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
- s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
- s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
- s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
- s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
- s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
-
- // stage 2
- s0 = (int)x0;
- s1 = (int)x1;
- s2 = (int)x2;
- s3 = (int)x3;
- s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
- s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
- s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
- s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-
- // stage 3
- s2 = (int)(cospi_16_64 * (x2 + x3));
- s3 = (int)(cospi_16_64 * (x2 - x3));
- s6 = (int)(cospi_16_64 * (x6 + x7));
- s7 = (int)(cospi_16_64 * (x6 - x7));
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x4);
- output[2] = WRAPLOW(x6);
- output[3] = WRAPLOW(-x2);
- output[4] = WRAPLOW(x3);
- output[5] = WRAPLOW(-x7);
- output[6] = WRAPLOW(x5);
- output[7] = WRAPLOW(-x1);
-}
-
-void av1_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- // only first 4 row has non-zero coefs
- for (i = 0; i < 4; ++i) {
- av1_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1));
- step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- step1[8] = WRAPLOW(step2[8] + step2[9]);
- step1[9] = WRAPLOW(step2[8] - step2[9]);
- step1[10] = WRAPLOW(-step2[10] + step2[11]);
- step1[11] = WRAPLOW(step2[10] + step2[11]);
- step1[12] = WRAPLOW(step2[12] + step2[13]);
- step1[13] = WRAPLOW(step2[12] - step2[13]);
- step1[14] = WRAPLOW(-step2[14] + step2[15]);
- step1[15] = WRAPLOW(step2[14] + step2[15]);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11]);
- step1[9] = WRAPLOW(step2[9] + step2[10]);
- step1[10] = WRAPLOW(step2[9] - step2[10]);
- step1[11] = WRAPLOW(step2[8] - step2[11]);
- step1[12] = WRAPLOW(-step2[12] + step2[15]);
- step1[13] = WRAPLOW(-step2[13] + step2[14]);
- step1[14] = WRAPLOW(step2[13] + step2[14]);
- step1[15] = WRAPLOW(step2[12] + step2[15]);
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7]);
- step2[1] = WRAPLOW(step1[1] + step1[6]);
- step2[2] = WRAPLOW(step1[2] + step1[5]);
- step2[3] = WRAPLOW(step1[3] + step1[4]);
- step2[4] = WRAPLOW(step1[3] - step1[4]);
- step2[5] = WRAPLOW(step1[2] - step1[5]);
- step2[6] = WRAPLOW(step1[1] - step1[6]);
- step2[7] = WRAPLOW(step1[0] - step1[7]);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = WRAPLOW(step2[0] + step2[15]);
- output[1] = WRAPLOW(step2[1] + step2[14]);
- output[2] = WRAPLOW(step2[2] + step2[13]);
- output[3] = WRAPLOW(step2[3] + step2[12]);
- output[4] = WRAPLOW(step2[4] + step2[11]);
- output[5] = WRAPLOW(step2[5] + step2[10]);
- output[6] = WRAPLOW(step2[6] + step2[9]);
- output[7] = WRAPLOW(step2[7] + step2[8]);
- output[8] = WRAPLOW(step2[7] - step2[8]);
- output[9] = WRAPLOW(step2[6] - step2[9]);
- output[10] = WRAPLOW(step2[5] - step2[10]);
- output[11] = WRAPLOW(step2[4] - step2[11]);
- output[12] = WRAPLOW(step2[3] - step2[12]);
- output[13] = WRAPLOW(step2[2] - step2[13]);
- output[14] = WRAPLOW(step2[1] - step2[14]);
- output[15] = WRAPLOW(step2[0] - step2[15]);
-}
-
-void av1_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows
- for (i = 0; i < 16; ++i) {
- av1_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = output[8] = output[9] = output[10] =
- output[11] = output[12] = output[13] = output[14] = output[15] = 0;
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = WRAPLOW(s0 + s4);
- x1 = WRAPLOW(s1 + s5);
- x2 = WRAPLOW(s2 + s6);
- x3 = WRAPLOW(s3 + s7);
- x4 = WRAPLOW(s0 - s4);
- x5 = WRAPLOW(s1 - s5);
- x6 = WRAPLOW(s2 - s6);
- x7 = WRAPLOW(s3 - s7);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
- x8 = WRAPLOW(s8 + s10);
- x9 = WRAPLOW(s9 + s11);
- x10 = WRAPLOW(s8 - s10);
- x11 = WRAPLOW(s9 - s11);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
- x10 = WRAPLOW(dct_const_round_shift(s10));
- x11 = WRAPLOW(dct_const_round_shift(s11));
- x14 = WRAPLOW(dct_const_round_shift(s14));
- x15 = WRAPLOW(dct_const_round_shift(s15));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x8);
- output[2] = WRAPLOW(x12);
- output[3] = WRAPLOW(-x4);
- output[4] = WRAPLOW(x6);
- output[5] = WRAPLOW(x14);
- output[6] = WRAPLOW(x10);
- output[7] = WRAPLOW(x2);
- output[8] = WRAPLOW(x3);
- output[9] = WRAPLOW(x11);
- output[10] = WRAPLOW(x15);
- output[11] = WRAPLOW(x7);
- output[12] = WRAPLOW(x5);
- output[13] = WRAPLOW(-x13);
- output[14] = WRAPLOW(x9);
- output[15] = WRAPLOW(-x1);
-}
-
-void av1_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- av1_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1));
- step1[31] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1));
- step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1));
- step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1));
- step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1));
- step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
- step2[16] = WRAPLOW(step1[16] + step1[17]);
- step2[17] = WRAPLOW(step1[16] - step1[17]);
- step2[18] = WRAPLOW(-step1[18] + step1[19]);
- step2[19] = WRAPLOW(step1[18] + step1[19]);
- step2[20] = WRAPLOW(step1[20] + step1[21]);
- step2[21] = WRAPLOW(step1[20] - step1[21]);
- step2[22] = WRAPLOW(-step1[22] + step1[23]);
- step2[23] = WRAPLOW(step1[22] + step1[23]);
- step2[24] = WRAPLOW(step1[24] + step1[25]);
- step2[25] = WRAPLOW(step1[24] - step1[25]);
- step2[26] = WRAPLOW(-step1[26] + step1[27]);
- step2[27] = WRAPLOW(step1[26] + step1[27]);
- step2[28] = WRAPLOW(step1[28] + step1[29]);
- step2[29] = WRAPLOW(step1[28] - step1[29]);
- step2[30] = WRAPLOW(-step1[30] + step1[31]);
- step2[31] = WRAPLOW(step1[30] + step1[31]);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- step1[8] = WRAPLOW(step2[8] + step2[9]);
- step1[9] = WRAPLOW(step2[8] - step2[9]);
- step1[10] = WRAPLOW(-step2[10] + step2[11]);
- step1[11] = WRAPLOW(step2[10] + step2[11]);
- step1[12] = WRAPLOW(step2[12] + step2[13]);
- step1[13] = WRAPLOW(step2[12] - step2[13]);
- step1[14] = WRAPLOW(-step2[14] + step2[15]);
- step1[15] = WRAPLOW(step2[14] + step2[15]);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1));
- step1[30] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = WRAPLOW(step1[16] + step1[19]);
- step2[17] = WRAPLOW(step1[17] + step1[18]);
- step2[18] = WRAPLOW(step1[17] - step1[18]);
- step2[19] = WRAPLOW(step1[16] - step1[19]);
- step2[20] = WRAPLOW(-step1[20] + step1[23]);
- step2[21] = WRAPLOW(-step1[21] + step1[22]);
- step2[22] = WRAPLOW(step1[21] + step1[22]);
- step2[23] = WRAPLOW(step1[20] + step1[23]);
-
- step2[24] = WRAPLOW(step1[24] + step1[27]);
- step2[25] = WRAPLOW(step1[25] + step1[26]);
- step2[26] = WRAPLOW(step1[25] - step1[26]);
- step2[27] = WRAPLOW(step1[24] - step1[27]);
- step2[28] = WRAPLOW(-step1[28] + step1[31]);
- step2[29] = WRAPLOW(-step1[29] + step1[30]);
- step2[30] = WRAPLOW(step1[29] + step1[30]);
- step2[31] = WRAPLOW(step1[28] + step1[31]);
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11]);
- step1[9] = WRAPLOW(step2[9] + step2[10]);
- step1[10] = WRAPLOW(step2[9] - step2[10]);
- step1[11] = WRAPLOW(step2[8] - step2[11]);
- step1[12] = WRAPLOW(-step2[12] + step2[15]);
- step1[13] = WRAPLOW(-step2[13] + step2[14]);
- step1[14] = WRAPLOW(step2[13] + step2[14]);
- step1[15] = WRAPLOW(step2[12] + step2[15]);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1));
- step1[28] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7]);
- step2[1] = WRAPLOW(step1[1] + step1[6]);
- step2[2] = WRAPLOW(step1[2] + step1[5]);
- step2[3] = WRAPLOW(step1[3] + step1[4]);
- step2[4] = WRAPLOW(step1[3] - step1[4]);
- step2[5] = WRAPLOW(step1[2] - step1[5]);
- step2[6] = WRAPLOW(step1[1] - step1[6]);
- step2[7] = WRAPLOW(step1[0] - step1[7]);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = WRAPLOW(step1[16] + step1[23]);
- step2[17] = WRAPLOW(step1[17] + step1[22]);
- step2[18] = WRAPLOW(step1[18] + step1[21]);
- step2[19] = WRAPLOW(step1[19] + step1[20]);
- step2[20] = WRAPLOW(step1[19] - step1[20]);
- step2[21] = WRAPLOW(step1[18] - step1[21]);
- step2[22] = WRAPLOW(step1[17] - step1[22]);
- step2[23] = WRAPLOW(step1[16] - step1[23]);
-
- step2[24] = WRAPLOW(-step1[24] + step1[31]);
- step2[25] = WRAPLOW(-step1[25] + step1[30]);
- step2[26] = WRAPLOW(-step1[26] + step1[29]);
- step2[27] = WRAPLOW(-step1[27] + step1[28]);
- step2[28] = WRAPLOW(step1[27] + step1[28]);
- step2[29] = WRAPLOW(step1[26] + step1[29]);
- step2[30] = WRAPLOW(step1[25] + step1[30]);
- step2[31] = WRAPLOW(step1[24] + step1[31]);
-
- // stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15]);
- step1[1] = WRAPLOW(step2[1] + step2[14]);
- step1[2] = WRAPLOW(step2[2] + step2[13]);
- step1[3] = WRAPLOW(step2[3] + step2[12]);
- step1[4] = WRAPLOW(step2[4] + step2[11]);
- step1[5] = WRAPLOW(step2[5] + step2[10]);
- step1[6] = WRAPLOW(step2[6] + step2[9]);
- step1[7] = WRAPLOW(step2[7] + step2[8]);
- step1[8] = WRAPLOW(step2[7] - step2[8]);
- step1[9] = WRAPLOW(step2[6] - step2[9]);
- step1[10] = WRAPLOW(step2[5] - step2[10]);
- step1[11] = WRAPLOW(step2[4] - step2[11]);
- step1[12] = WRAPLOW(step2[3] - step2[12]);
- step1[13] = WRAPLOW(step2[2] - step2[13]);
- step1[14] = WRAPLOW(step2[1] - step2[14]);
- step1[15] = WRAPLOW(step2[0] - step2[15]);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1));
- step1[24] = WRAPLOW(dct_const_round_shift(temp2));
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = WRAPLOW(step1[0] + step1[31]);
- output[1] = WRAPLOW(step1[1] + step1[30]);
- output[2] = WRAPLOW(step1[2] + step1[29]);
- output[3] = WRAPLOW(step1[3] + step1[28]);
- output[4] = WRAPLOW(step1[4] + step1[27]);
- output[5] = WRAPLOW(step1[5] + step1[26]);
- output[6] = WRAPLOW(step1[6] + step1[25]);
- output[7] = WRAPLOW(step1[7] + step1[24]);
- output[8] = WRAPLOW(step1[8] + step1[23]);
- output[9] = WRAPLOW(step1[9] + step1[22]);
- output[10] = WRAPLOW(step1[10] + step1[21]);
- output[11] = WRAPLOW(step1[11] + step1[20]);
- output[12] = WRAPLOW(step1[12] + step1[19]);
- output[13] = WRAPLOW(step1[13] + step1[18]);
- output[14] = WRAPLOW(step1[14] + step1[17]);
- output[15] = WRAPLOW(step1[15] + step1[16]);
- output[16] = WRAPLOW(step1[15] - step1[16]);
- output[17] = WRAPLOW(step1[14] - step1[17]);
- output[18] = WRAPLOW(step1[13] - step1[18]);
- output[19] = WRAPLOW(step1[12] - step1[19]);
- output[20] = WRAPLOW(step1[11] - step1[20]);
- output[21] = WRAPLOW(step1[10] - step1[21]);
- output[22] = WRAPLOW(step1[9] - step1[22]);
- output[23] = WRAPLOW(step1[8] - step1[23]);
- output[24] = WRAPLOW(step1[7] - step1[24]);
- output[25] = WRAPLOW(step1[6] - step1[25]);
- output[26] = WRAPLOW(step1[5] - step1[26]);
- output[27] = WRAPLOW(step1[4] - step1[27]);
- output[28] = WRAPLOW(step1[3] - step1[28]);
- output[29] = WRAPLOW(step1[2] - step1[29]);
- output[30] = WRAPLOW(step1[1] - step1[30]);
- output[31] = WRAPLOW(step1[0] - step1[31]);
-}
-
-void av1_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- for (i = 0; i < 32; ++i) {
- int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- av1_idct32_c(input, outptr);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- av1_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- // only upper-left 8x8 has non-zero coeff
- for (i = 0; i < 8; ++i) {
- av1_idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- av1_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
-
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = HIGHBD_WRAPLOW(a1, bd);
- op[1] = HIGHBD_WRAPLOW(b1, bd);
- op[2] = HIGHBD_WRAPLOW(c1, bd);
- op[3] = HIGHBD_WRAPLOW(d1, bd);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] =
- highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
- dest[stride * 1] =
- highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
- dest[stride * 2] =
- highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
- dest[stride * 3] =
- highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
- ip++;
- dest++;
- }
-}
-
-void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- (void)bd;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = HIGHBD_WRAPLOW(a1, bd);
- op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] =
- highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] =
- highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] =
- highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] =
- highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
- ip++;
- dest++;
- }
-}
-
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- (void)bd;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2
- output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
- output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
- output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
- output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
-}
-
-void av1_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
-}
-
-void av1_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
- dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
- dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
- dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
- dest += dest_stride;
- }
-}
-
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2 & stage 3 - even half
- av1_highbd_idct4_c(step1, step1, bd);
-
- // stage 2 - odd half
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- // stage 3 - odd half
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- // stage 4
- output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void av1_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 8; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void av1_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3)) {
- memset(output, 0, 4 * sizeof(*output));
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
- output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
- output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
-}
-
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[7];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[5];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[3];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[1];
- tran_low_t x7 = input[6];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- memset(output, 0, 8 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
-
- // stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
-
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x4, bd);
- output[2] = HIGHBD_WRAPLOW(x6, bd);
- output[3] = HIGHBD_WRAPLOW(-x2, bd);
- output[4] = HIGHBD_WRAPLOW(x3, bd);
- output[5] = HIGHBD_WRAPLOW(-x7, bd);
- output[6] = HIGHBD_WRAPLOW(x5, bd);
- output[7] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- // Only first 4 row has non-zero coefs.
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
- (void)bd;
-
- // stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
- // stage 6
- step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
- output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
- output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
- output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
- output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
- output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
- output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
- output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
- output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
- output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
- output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
- output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
- output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
- output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
- output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
- output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-}
-
-void av1_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 16; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_low_t x0 = input[15];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[13];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[11];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[9];
- tran_low_t x7 = input[6];
- tran_low_t x8 = input[7];
- tran_low_t x9 = input[8];
- tran_low_t x10 = input[5];
- tran_low_t x11 = input[10];
- tran_low_t x12 = input[3];
- tran_low_t x13 = input[12];
- tran_low_t x14 = input[1];
- tran_low_t x15 = input[14];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- memset(output, 0, 16 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
- x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
- x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
- x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
- x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
- x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
- x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
- x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
- x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
- x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
- x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x8, bd);
- output[2] = HIGHBD_WRAPLOW(x12, bd);
- output[3] = HIGHBD_WRAPLOW(-x4, bd);
- output[4] = HIGHBD_WRAPLOW(x6, bd);
- output[5] = HIGHBD_WRAPLOW(x14, bd);
- output[6] = HIGHBD_WRAPLOW(x10, bd);
- output[7] = HIGHBD_WRAPLOW(x2, bd);
- output[8] = HIGHBD_WRAPLOW(x3, bd);
- output[9] = HIGHBD_WRAPLOW(x11, bd);
- output[10] = HIGHBD_WRAPLOW(x15, bd);
- output[11] = HIGHBD_WRAPLOW(x7, bd);
- output[12] = HIGHBD_WRAPLOW(x5, bd);
- output[13] = HIGHBD_WRAPLOW(-x13, bd);
- output[14] = HIGHBD_WRAPLOW(x9, bd);
- output[15] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
- int bd) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
- (void)bd;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
- step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
- step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
- step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
- step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
- step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
- step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
- step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
- step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
- step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
- step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
- step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
- step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
- step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
- step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
- step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
-
- step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
- step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
- step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
- step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
- step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
- step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
- step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
-
- // stage 5
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
- step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
- step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
- step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
- step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
-
- step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
- step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
- step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
- step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
- step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
- step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
- step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
-
- // stage 7
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
- step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
- step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
- step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
- step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
- step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
- output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
- output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
- output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
- output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
- output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
- output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
- output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
- output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
- output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
- output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
- output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
- output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
- output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
- output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
- output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
- output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
- output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
- output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
- output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
- output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
- output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
- output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
- output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
- output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
- output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
- output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
- output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
- output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
- output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
- output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
- output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
-}
-
-void av1_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_low_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- highbd_idct32_c(input, outptr, bd);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- // Only upper-left 8x8 has non-zero coeff.
- for (i = 0; i < 8; ++i) {
- highbd_idct32_c(input, outptr, bd);
- input += 32;
- outptr += 32;
- }
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- int a1;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_inv_txfm.h b/av1/common/av1_inv_txfm.h
deleted file mode 100644
index c57e888..0000000
--- a/av1/common/av1_inv_txfm.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_INV_TXFM_H_
-#define AOM_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_high_t check_range(tran_high_t input) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid input streams, intermediate stage coefficients should always
- // stay within the range of a signed 16 bit integer. Coefficients can go out
- // of this range for invalid/corrupt streams. However, strictly checking
- // this range for every intermediate coefficient can burdensome for a decoder,
- // therefore the following assertion is only enabled when configured with
- // --enable-coefficient-range-checking.
- assert(INT16_MIN <= input);
- assert(input <= INT16_MAX);
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- return input;
-}
-
-static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return rv;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid highbitdepth streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
- const int32_t int_max = (1 << (7 + bd)) - 1;
- const int32_t int_min = -int_max - 1;
- assert(int_min <= input);
- assert(input <= int_max);
- (void)int_min;
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- (void)bd;
- return input;
-}
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return rv;
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However to aid in hardware
-// verification they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses trans_low with 16bits, need to remove 16bits
-// bd of 10 uses trans_low with 18bits, need to remove 14bits
-// bd of 12 uses trans_low with 20bits, need to remove 12bits
-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-
-#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
-#if CONFIG_AOM_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) \
- ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
-#else // CONFIG_EMULATE_HARDWARE
-
-#define WRAPLOW(x) ((int32_t)check_range(x))
-#if CONFIG_AOM_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
-#endif // CONFIG_EMULATE_HARDWARE
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
- int bd) {
- trans = HIGHBD_WRAPLOW(trans, bd);
- return clip_pixel_highbd(dest + (int)trans, bd);
-}
-#endif
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
- trans = WRAPLOW(trans);
- return clip_pixel(dest + (int)trans);
-}
-#ifdef __cplusplus
-} // extern "C"
-#endif
-#endif // AOM_DSP_INV_TXFM_H_
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 0ca93aa..e52dd04 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -110,7 +110,7 @@
specialize qw/av1_iht8x8_64_add sse2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/av1_iht16x16_256_add sse2/;
+ specialize qw/av1_iht16x16_256_add sse2 avx2/;
}
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -167,7 +167,7 @@
specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/av1_iht16x16_256_add sse2 dspr2/;
+ specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_iht4x4_16_add msa/;
@@ -404,62 +404,6 @@
specialize qw/av1_fht32x16 sse2/;
}
-if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1/;
-} else {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4 sse2/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1 sse2/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8 sse2/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1 sse2/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16 sse2/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1 sse2/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32 sse2/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1 sse2/;
-}
-
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_fht4x4 msa/;
@@ -468,243 +412,9 @@
}
}
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct4x4/;
-
- add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8/;
-
- add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8_1/;
-
- add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16/;
-
- add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16_1/;
-
- add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32/;
-
- add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_rd/;
-
- add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_1/;
- } else {
- add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct4x4 sse2/;
-
- add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8 sse2/;
-
- add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8_1/;
-
- add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16 sse2/;
-
- add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16_1/;
-
- add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32 sse2/;
-
- add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_1/;
- }
-}
-
add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
specialize qw/av1_fwd_idtx/;
-# Inverse transform
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- # Note as optimized versions of these functions are added we need to add a check to ensure
- # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
-
- add_proto qw/void av1_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_1_add/;
-
- add_proto qw/void av1_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_1_add/;
-
- add_proto qw/void av1_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_1_add/;
-
- add_proto qw/void av1_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_1024_add/;
-
- add_proto qw/void av1_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_34_add/;
-
- add_proto qw/void av1_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_1_add/;
-
- add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_iwht4x4_1_add/;
-
- add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_iwht4x4_16_add/;
-
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_16_add/;
-
- add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_64_add/;
-
- add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_10_add/;
-
- add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_256_add/;
-
- add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_10_add/;
- } else {
- add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_16_add sse2/;
-
- add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_64_add sse2/;
-
- add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_10_add sse2/;
-
- add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_256_add sse2/;
-
- add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_10_add sse2/;
- } # CONFIG_EMULATE_HARDWARE
-} else {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
- } else {
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add sse2/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add sse2/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add sse2/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add sse2/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add sse2/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add sse2/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add sse2/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add sse2/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add sse2/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add sse2/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add sse2/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
- } # CONFIG_EMULATE_HARDWARE
-} # CONFIG_AOM_HIGHBITDEPTH
-
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
#fwd txfm
add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index cea5769..910ba0f 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -561,283 +561,348 @@
-PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT },
};
-const aom_prob
- av1_default_palette_y_color_prob[PALETTE_MAX_SIZE - 1]
- [PALETTE_COLOR_CONTEXTS]
- [PALETTE_COLORS - 1] = {
- {
- // 2 colors
- { 230, 0, 0, 0, 0, 0, 0 },
- { 214, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 240, 0, 0, 0, 0, 0, 0 },
- { 73, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 130, 0, 0, 0, 0, 0, 0 },
- { 227, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 188, 0, 0, 0, 0, 0, 0 },
- { 75, 0, 0, 0, 0, 0, 0 },
- { 250, 0, 0, 0, 0, 0, 0 },
- { 223, 0, 0, 0, 0, 0, 0 },
- { 252, 0, 0, 0, 0, 0, 0 },
- },
- {
- // 3 colors
- { 229, 137, 0, 0, 0, 0, 0 },
- { 197, 120, 0, 0, 0, 0, 0 },
- { 107, 195, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 27, 151, 0, 0, 0, 0, 0 },
- { 230, 130, 0, 0, 0, 0, 0 },
- { 37, 230, 0, 0, 0, 0, 0 },
- { 67, 221, 0, 0, 0, 0, 0 },
- { 124, 230, 0, 0, 0, 0, 0 },
- { 195, 109, 0, 0, 0, 0, 0 },
- { 99, 122, 0, 0, 0, 0, 0 },
- { 205, 208, 0, 0, 0, 0, 0 },
- { 40, 235, 0, 0, 0, 0, 0 },
- { 251, 132, 0, 0, 0, 0, 0 },
- { 237, 186, 0, 0, 0, 0, 0 },
- { 253, 112, 0, 0, 0, 0, 0 },
- },
- {
- // 4 colors
- { 195, 87, 128, 0, 0, 0, 0 },
- { 143, 100, 123, 0, 0, 0, 0 },
- { 94, 124, 119, 0, 0, 0, 0 },
- { 77, 91, 130, 0, 0, 0, 0 },
- { 39, 114, 178, 0, 0, 0, 0 },
- { 222, 94, 125, 0, 0, 0, 0 },
- { 44, 203, 132, 0, 0, 0, 0 },
- { 68, 175, 122, 0, 0, 0, 0 },
- { 110, 187, 124, 0, 0, 0, 0 },
- { 152, 91, 128, 0, 0, 0, 0 },
- { 70, 109, 181, 0, 0, 0, 0 },
- { 133, 113, 164, 0, 0, 0, 0 },
- { 47, 205, 133, 0, 0, 0, 0 },
- { 247, 94, 136, 0, 0, 0, 0 },
- { 205, 122, 146, 0, 0, 0, 0 },
- { 251, 100, 141, 0, 0, 0, 0 },
- },
- {
- // 5 colors
- { 195, 65, 84, 125, 0, 0, 0 },
- { 150, 76, 84, 121, 0, 0, 0 },
- { 94, 110, 81, 117, 0, 0, 0 },
- { 79, 85, 91, 139, 0, 0, 0 },
- { 26, 102, 139, 127, 0, 0, 0 },
- { 220, 73, 91, 119, 0, 0, 0 },
- { 38, 203, 86, 127, 0, 0, 0 },
- { 61, 186, 72, 124, 0, 0, 0 },
- { 132, 199, 84, 128, 0, 0, 0 },
- { 172, 52, 62, 120, 0, 0, 0 },
- { 102, 89, 121, 122, 0, 0, 0 },
- { 182, 48, 69, 186, 0, 0, 0 },
- { 36, 206, 87, 126, 0, 0, 0 },
- { 249, 55, 67, 122, 0, 0, 0 },
- { 218, 88, 75, 122, 0, 0, 0 },
- { 253, 64, 80, 119, 0, 0, 0 },
- },
- {
- // 6 colors
- { 182, 54, 64, 75, 118, 0, 0 },
- { 126, 67, 70, 76, 116, 0, 0 },
- { 79, 92, 67, 85, 120, 0, 0 },
- { 63, 61, 81, 118, 132, 0, 0 },
- { 21, 80, 105, 83, 119, 0, 0 },
- { 215, 72, 74, 74, 111, 0, 0 },
- { 50, 176, 63, 79, 120, 0, 0 },
- { 72, 148, 66, 77, 120, 0, 0 },
- { 105, 177, 57, 78, 130, 0, 0 },
- { 150, 66, 66, 80, 127, 0, 0 },
- { 81, 76, 109, 85, 116, 0, 0 },
- { 113, 81, 62, 96, 148, 0, 0 },
- { 54, 179, 69, 82, 121, 0, 0 },
- { 244, 47, 48, 67, 118, 0, 0 },
- { 198, 83, 53, 65, 121, 0, 0 },
- { 250, 42, 51, 69, 110, 0, 0 },
- },
- {
- // 7 colors
- { 182, 45, 54, 62, 74, 113, 0 },
- { 124, 63, 57, 62, 77, 114, 0 },
- { 77, 80, 56, 66, 76, 117, 0 },
- { 63, 57, 69, 98, 85, 131, 0 },
- { 19, 81, 98, 63, 80, 116, 0 },
- { 215, 56, 60, 63, 68, 105, 0 },
- { 50, 174, 50, 60, 79, 118, 0 },
- { 68, 151, 50, 58, 73, 117, 0 },
- { 104, 182, 53, 57, 79, 127, 0 },
- { 156, 50, 51, 63, 77, 111, 0 },
- { 88, 67, 97, 59, 82, 120, 0 },
- { 114, 81, 46, 65, 103, 132, 0 },
- { 55, 166, 57, 66, 82, 120, 0 },
- { 245, 34, 38, 43, 63, 114, 0 },
- { 203, 68, 45, 47, 60, 118, 0 },
- { 250, 35, 37, 47, 66, 110, 0 },
- },
- {
- // 8 colors
- { 180, 43, 46, 50, 56, 69, 109 },
- { 116, 53, 51, 49, 57, 73, 115 },
- { 79, 70, 49, 50, 59, 74, 117 },
- { 60, 54, 57, 70, 62, 83, 129 },
- { 20, 73, 85, 52, 66, 81, 119 },
- { 213, 56, 52, 49, 53, 62, 104 },
- { 48, 161, 41, 45, 56, 77, 116 },
- { 68, 139, 40, 47, 54, 71, 116 },
- { 123, 166, 42, 43, 52, 76, 130 },
- { 153, 44, 44, 47, 54, 79, 129 },
- { 87, 64, 83, 49, 60, 75, 127 },
- { 131, 68, 43, 48, 73, 96, 130 },
- { 55, 152, 45, 51, 64, 77, 113 },
- { 243, 30, 28, 33, 41, 65, 114 },
- { 202, 56, 35, 36, 42, 63, 123 },
- { 249, 31, 29, 32, 45, 68, 111 },
- }
- };
+// Note: Has to be non-zero to avoid any asserts triggering.
+#define UNUSED_PROB 128
-const aom_prob
- av1_default_palette_uv_color_prob[PALETTE_MAX_SIZE - 1]
- [PALETTE_COLOR_CONTEXTS]
- [PALETTE_COLORS - 1] = {
- {
- // 2 colors
- { 228, 0, 0, 0, 0, 0, 0 },
- { 195, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 228, 0, 0, 0, 0, 0, 0 },
- { 71, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 129, 0, 0, 0, 0, 0, 0 },
- { 206, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 136, 0, 0, 0, 0, 0, 0 },
- { 98, 0, 0, 0, 0, 0, 0 },
- { 236, 0, 0, 0, 0, 0, 0 },
- { 222, 0, 0, 0, 0, 0, 0 },
- { 249, 0, 0, 0, 0, 0, 0 },
- },
- {
- // 3 colors
- { 198, 136, 0, 0, 0, 0, 0 },
- { 178, 105, 0, 0, 0, 0, 0 },
- { 100, 206, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 12, 136, 0, 0, 0, 0, 0 },
- { 219, 134, 0, 0, 0, 0, 0 },
- { 50, 198, 0, 0, 0, 0, 0 },
- { 61, 231, 0, 0, 0, 0, 0 },
- { 110, 209, 0, 0, 0, 0, 0 },
- { 173, 106, 0, 0, 0, 0, 0 },
- { 145, 166, 0, 0, 0, 0, 0 },
- { 156, 175, 0, 0, 0, 0, 0 },
- { 69, 183, 0, 0, 0, 0, 0 },
- { 241, 163, 0, 0, 0, 0, 0 },
- { 224, 160, 0, 0, 0, 0, 0 },
- { 246, 154, 0, 0, 0, 0, 0 },
- },
- {
- // 4 colors
- { 173, 88, 143, 0, 0, 0, 0 },
- { 146, 81, 127, 0, 0, 0, 0 },
- { 84, 134, 102, 0, 0, 0, 0 },
- { 69, 138, 140, 0, 0, 0, 0 },
- { 31, 103, 200, 0, 0, 0, 0 },
- { 217, 101, 139, 0, 0, 0, 0 },
- { 51, 174, 121, 0, 0, 0, 0 },
- { 64, 177, 109, 0, 0, 0, 0 },
- { 96, 179, 145, 0, 0, 0, 0 },
- { 164, 77, 114, 0, 0, 0, 0 },
- { 87, 94, 156, 0, 0, 0, 0 },
- { 105, 57, 173, 0, 0, 0, 0 },
- { 63, 158, 137, 0, 0, 0, 0 },
- { 236, 102, 156, 0, 0, 0, 0 },
- { 197, 115, 153, 0, 0, 0, 0 },
- { 245, 106, 154, 0, 0, 0, 0 },
- },
- {
- // 5 colors
- { 179, 64, 97, 129, 0, 0, 0 },
- { 137, 56, 88, 125, 0, 0, 0 },
- { 82, 107, 61, 118, 0, 0, 0 },
- { 59, 113, 86, 115, 0, 0, 0 },
- { 23, 88, 118, 130, 0, 0, 0 },
- { 213, 66, 90, 125, 0, 0, 0 },
- { 37, 181, 103, 121, 0, 0, 0 },
- { 47, 188, 61, 131, 0, 0, 0 },
- { 104, 185, 103, 144, 0, 0, 0 },
- { 163, 39, 76, 112, 0, 0, 0 },
- { 94, 74, 131, 126, 0, 0, 0 },
- { 142, 42, 103, 163, 0, 0, 0 },
- { 53, 162, 99, 149, 0, 0, 0 },
- { 239, 54, 84, 108, 0, 0, 0 },
- { 203, 84, 110, 147, 0, 0, 0 },
- { 248, 70, 105, 151, 0, 0, 0 },
- },
- {
- // 6 colors
- { 189, 50, 67, 90, 130, 0, 0 },
- { 114, 50, 55, 90, 123, 0, 0 },
- { 66, 76, 54, 82, 128, 0, 0 },
- { 43, 69, 69, 80, 129, 0, 0 },
- { 22, 59, 87, 88, 141, 0, 0 },
- { 203, 49, 68, 87, 122, 0, 0 },
- { 43, 157, 74, 104, 146, 0, 0 },
- { 54, 138, 51, 95, 138, 0, 0 },
- { 82, 171, 58, 102, 146, 0, 0 },
- { 129, 38, 59, 64, 168, 0, 0 },
- { 56, 67, 119, 92, 112, 0, 0 },
- { 96, 62, 53, 132, 82, 0, 0 },
- { 60, 147, 77, 108, 145, 0, 0 },
- { 238, 76, 73, 93, 148, 0, 0 },
- { 189, 86, 73, 103, 157, 0, 0 },
- { 246, 62, 75, 83, 167, 0, 0 },
- },
- {
- // 7 colors
- { 179, 42, 51, 73, 99, 134, 0 },
- { 119, 52, 52, 61, 64, 114, 0 },
- { 53, 77, 35, 65, 71, 131, 0 },
- { 38, 70, 51, 68, 89, 144, 0 },
- { 23, 65, 128, 73, 97, 131, 0 },
- { 210, 47, 52, 63, 81, 143, 0 },
- { 42, 159, 57, 68, 98, 143, 0 },
- { 49, 153, 45, 82, 93, 143, 0 },
- { 81, 169, 52, 72, 113, 151, 0 },
- { 136, 46, 35, 56, 75, 96, 0 },
- { 57, 84, 109, 47, 107, 131, 0 },
- { 128, 78, 57, 36, 128, 85, 0 },
- { 54, 149, 68, 77, 94, 153, 0 },
- { 243, 58, 50, 71, 81, 167, 0 },
- { 189, 92, 64, 70, 121, 173, 0 },
- { 248, 35, 38, 51, 82, 201, 0 },
- },
- {
- // 8 colors
- { 201, 40, 36, 42, 64, 92, 123 },
- { 116, 43, 33, 43, 73, 102, 128 },
- { 46, 77, 37, 69, 62, 78, 150 },
- { 40, 65, 52, 50, 76, 89, 133 },
- { 28, 48, 91, 17, 64, 77, 133 },
- { 218, 43, 43, 37, 56, 72, 163 },
- { 41, 155, 44, 83, 82, 129, 180 },
- { 44, 141, 29, 55, 64, 89, 147 },
- { 92, 166, 48, 45, 59, 126, 179 },
- { 169, 35, 49, 41, 36, 99, 139 },
- { 55, 77, 77, 56, 60, 75, 156 },
- { 155, 81, 51, 64, 57, 182, 255 },
- { 60, 134, 49, 49, 93, 128, 174 },
- { 244, 98, 51, 46, 22, 73, 238 },
- { 189, 70, 40, 87, 93, 79, 201 },
- { 248, 54, 49, 40, 29, 42, 227 },
- }
- };
+const aom_prob av1_default_palette_y_color_prob
+ [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 214, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 240, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 73, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 227, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 188, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 75, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 250, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 223, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 252, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 229, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 197, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 107, 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 27, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 230, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 37, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 67, 221, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 124, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 195, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 99, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 205, 208, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 40, 235, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 251, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 237, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 253, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 195, 87, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 143, 100, 123, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 124, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 77, 91, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 39, 114, 178, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 222, 94, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 44, 203, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 68, 175, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 110, 187, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 152, 91, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 70, 109, 181, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 133, 113, 164, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 47, 205, 133, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 247, 94, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 205, 122, 146, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 251, 100, 141, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 195, 65, 84, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 150, 76, 84, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 110, 81, 117, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 79, 85, 91, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 26, 102, 139, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 220, 73, 91, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 38, 203, 86, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 61, 186, 72, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 132, 199, 84, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 172, 52, 62, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 102, 89, 121, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 182, 48, 69, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 36, 206, 87, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 249, 55, 67, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 218, 88, 75, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 253, 64, 80, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 182, 54, 64, 75, 118, UNUSED_PROB, UNUSED_PROB },
+ { 126, 67, 70, 76, 116, UNUSED_PROB, UNUSED_PROB },
+ { 79, 92, 67, 85, 120, UNUSED_PROB, UNUSED_PROB },
+ { 63, 61, 81, 118, 132, UNUSED_PROB, UNUSED_PROB },
+ { 21, 80, 105, 83, 119, UNUSED_PROB, UNUSED_PROB },
+ { 215, 72, 74, 74, 111, UNUSED_PROB, UNUSED_PROB },
+ { 50, 176, 63, 79, 120, UNUSED_PROB, UNUSED_PROB },
+ { 72, 148, 66, 77, 120, UNUSED_PROB, UNUSED_PROB },
+ { 105, 177, 57, 78, 130, UNUSED_PROB, UNUSED_PROB },
+ { 150, 66, 66, 80, 127, UNUSED_PROB, UNUSED_PROB },
+ { 81, 76, 109, 85, 116, UNUSED_PROB, UNUSED_PROB },
+ { 113, 81, 62, 96, 148, UNUSED_PROB, UNUSED_PROB },
+ { 54, 179, 69, 82, 121, UNUSED_PROB, UNUSED_PROB },
+ { 244, 47, 48, 67, 118, UNUSED_PROB, UNUSED_PROB },
+ { 198, 83, 53, 65, 121, UNUSED_PROB, UNUSED_PROB },
+ { 250, 42, 51, 69, 110, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 182, 45, 54, 62, 74, 113, UNUSED_PROB },
+ { 124, 63, 57, 62, 77, 114, UNUSED_PROB },
+ { 77, 80, 56, 66, 76, 117, UNUSED_PROB },
+ { 63, 57, 69, 98, 85, 131, UNUSED_PROB },
+ { 19, 81, 98, 63, 80, 116, UNUSED_PROB },
+ { 215, 56, 60, 63, 68, 105, UNUSED_PROB },
+ { 50, 174, 50, 60, 79, 118, UNUSED_PROB },
+ { 68, 151, 50, 58, 73, 117, UNUSED_PROB },
+ { 104, 182, 53, 57, 79, 127, UNUSED_PROB },
+ { 156, 50, 51, 63, 77, 111, UNUSED_PROB },
+ { 88, 67, 97, 59, 82, 120, UNUSED_PROB },
+ { 114, 81, 46, 65, 103, 132, UNUSED_PROB },
+ { 55, 166, 57, 66, 82, 120, UNUSED_PROB },
+ { 245, 34, 38, 43, 63, 114, UNUSED_PROB },
+ { 203, 68, 45, 47, 60, 118, UNUSED_PROB },
+ { 250, 35, 37, 47, 66, 110, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 180, 43, 46, 50, 56, 69, 109 },
+ { 116, 53, 51, 49, 57, 73, 115 },
+ { 79, 70, 49, 50, 59, 74, 117 },
+ { 60, 54, 57, 70, 62, 83, 129 },
+ { 20, 73, 85, 52, 66, 81, 119 },
+ { 213, 56, 52, 49, 53, 62, 104 },
+ { 48, 161, 41, 45, 56, 77, 116 },
+ { 68, 139, 40, 47, 54, 71, 116 },
+ { 123, 166, 42, 43, 52, 76, 130 },
+ { 153, 44, 44, 47, 54, 79, 129 },
+ { 87, 64, 83, 49, 60, 75, 127 },
+ { 131, 68, 43, 48, 73, 96, 130 },
+ { 55, 152, 45, 51, 64, 77, 113 },
+ { 243, 30, 28, 33, 41, 65, 114 },
+ { 202, 56, 35, 36, 42, 63, 123 },
+ { 249, 31, 29, 32, 45, 68, 111 },
+ }
+ };
+
+const aom_prob av1_default_palette_uv_color_prob
+ [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 71, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 98, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 236, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 222, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 249, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 198, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 178, 105, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 100, 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 12, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 219, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 50, 198, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 61, 231, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 110, 209, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 173, 106, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 145, 166, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 156, 175, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 69, 183, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 241, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 224, 160, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 246, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 173, 88, 143, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 146, 81, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 84, 134, 102, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 69, 138, 140, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 31, 103, 200, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 217, 101, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 51, 174, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 64, 177, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 96, 179, 145, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 164, 77, 114, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 87, 94, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 105, 57, 173, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 63, 158, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 236, 102, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 197, 115, 153, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 245, 106, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 179, 64, 97, 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 137, 56, 88, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 82, 107, 61, 118, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 59, 113, 86, 115, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 23, 88, 118, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 213, 66, 90, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 37, 181, 103, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 47, 188, 61, 131, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 104, 185, 103, 144, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 163, 39, 76, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 74, 131, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 142, 42, 103, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 53, 162, 99, 149, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 239, 54, 84, 108, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 203, 84, 110, 147, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 248, 70, 105, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 189, 50, 67, 90, 130, UNUSED_PROB, UNUSED_PROB },
+ { 114, 50, 55, 90, 123, UNUSED_PROB, UNUSED_PROB },
+ { 66, 76, 54, 82, 128, UNUSED_PROB, UNUSED_PROB },
+ { 43, 69, 69, 80, 129, UNUSED_PROB, UNUSED_PROB },
+ { 22, 59, 87, 88, 141, UNUSED_PROB, UNUSED_PROB },
+ { 203, 49, 68, 87, 122, UNUSED_PROB, UNUSED_PROB },
+ { 43, 157, 74, 104, 146, UNUSED_PROB, UNUSED_PROB },
+ { 54, 138, 51, 95, 138, UNUSED_PROB, UNUSED_PROB },
+ { 82, 171, 58, 102, 146, UNUSED_PROB, UNUSED_PROB },
+ { 129, 38, 59, 64, 168, UNUSED_PROB, UNUSED_PROB },
+ { 56, 67, 119, 92, 112, UNUSED_PROB, UNUSED_PROB },
+ { 96, 62, 53, 132, 82, UNUSED_PROB, UNUSED_PROB },
+ { 60, 147, 77, 108, 145, UNUSED_PROB, UNUSED_PROB },
+ { 238, 76, 73, 93, 148, UNUSED_PROB, UNUSED_PROB },
+ { 189, 86, 73, 103, 157, UNUSED_PROB, UNUSED_PROB },
+ { 246, 62, 75, 83, 167, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 179, 42, 51, 73, 99, 134, UNUSED_PROB },
+ { 119, 52, 52, 61, 64, 114, UNUSED_PROB },
+ { 53, 77, 35, 65, 71, 131, UNUSED_PROB },
+ { 38, 70, 51, 68, 89, 144, UNUSED_PROB },
+ { 23, 65, 128, 73, 97, 131, UNUSED_PROB },
+ { 210, 47, 52, 63, 81, 143, UNUSED_PROB },
+ { 42, 159, 57, 68, 98, 143, UNUSED_PROB },
+ { 49, 153, 45, 82, 93, 143, UNUSED_PROB },
+ { 81, 169, 52, 72, 113, 151, UNUSED_PROB },
+ { 136, 46, 35, 56, 75, 96, UNUSED_PROB },
+ { 57, 84, 109, 47, 107, 131, UNUSED_PROB },
+ { 128, 78, 57, 36, 128, 85, UNUSED_PROB },
+ { 54, 149, 68, 77, 94, 153, UNUSED_PROB },
+ { 243, 58, 50, 71, 81, 167, UNUSED_PROB },
+ { 189, 92, 64, 70, 121, 173, UNUSED_PROB },
+ { 248, 35, 38, 51, 82, 201, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 201, 40, 36, 42, 64, 92, 123 },
+ { 116, 43, 33, 43, 73, 102, 128 },
+ { 46, 77, 37, 69, 62, 78, 150 },
+ { 40, 65, 52, 50, 76, 89, 133 },
+ { 28, 48, 91, 17, 64, 77, 133 },
+ { 218, 43, 43, 37, 56, 72, 163 },
+ { 41, 155, 44, 83, 82, 129, 180 },
+ { 44, 141, 29, 55, 64, 89, 147 },
+ { 92, 166, 48, 45, 59, 126, 179 },
+ { 169, 35, 49, 41, 36, 99, 139 },
+ { 55, 77, 77, 56, 60, 75, 156 },
+ { 155, 81, 51, 64, 57, 182, 255 },
+ { 60, 134, 49, 49, 93, 128, 174 },
+ { 244, 98, 51, 46, 22, 73, 238 },
+ { 189, 70, 40, 87, 93, 79, 201 },
+ { 248, 54, 49, 40, 29, 42, 227 },
+ }
+ };
+
+#undef UNUSED_PROB
static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
// (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
@@ -1463,8 +1528,8 @@
}
for (i = 0; i < INTRA_MODES; ++i)
- av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[j],
- fc->uv_mode_cdf[j]);
+ av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
+ fc->uv_mode_cdf[i]);
for (i = 0; i < PARTITION_CONTEXTS; ++i)
av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 5a95b47..cc20858 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1012,17 +1012,12 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- av1_iht16x16_256_add(input, dest, stride, tx_type);
- break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- // Use C version since DST only exists in C code
- av1_iht16x16_256_add_c(input, dest, stride, tx_type);
- break;
+ case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index b3228b7..69d0cc0 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -16,11 +16,6 @@
#include "av1/common/seg_common.h"
#include "av1/common/blockd.h"
-#if CONFIG_AOM_QM
-static void make_qmatrices(qm_val_t *wmatrix[NUM_QM_LEVELS][2][2][TX_SIZES],
- qm_val_t *iwmatrix[NUM_QM_LEVELS][2][2][TX_SIZES]);
-#endif
-
#if CONFIG_NEW_QUANT
// Bin widths expressed as a fraction over 128 of the quant stepsize,
// for the quantization bins 0-4.
diff --git a/av1/common/scan.c b/av1/common/scan.c
index 81b6de1..b2386b9 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -4429,8 +4429,7 @@
int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type);
- const int tx2d_size = tx_size_2d[tx_size];
- assert(tx2d_size <= 1024);
+ assert(tx_size_2d[tx_size] <= 1024);
av1_update_sort_order(tx_size, non_zero_prob, sort_order);
av1_update_scan_order(tx_size, sort_order, scan, iscan);
av1_update_neighbors(tx_size, scan, iscan, nb);
diff --git a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h b/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 876e579..0000000
--- a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3202 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also causes cross reference to the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void av1_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rows_c
-#else
-void av1_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
- av1_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rd_rows_c
-#endif // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif // DCT_HIGH_BIT_DEPTH
-
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
- // Calculate pre-multiplied strides
- const int str1 = stride;
- const int str2 = 2 * stride;
- const int str3 = 2 * stride + str1;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
- const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
- const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
- const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
- const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
- const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
- const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
- const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
- const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
- const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
- const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
- const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
- const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
- const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
- const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- int pass;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 32; column_start += 8) {
- __m128i step1[32];
- __m128i step2[32];
- __m128i step3[32];
- __m128i out[32];
- // Stage 1
- // Note: even though all the loads below are aligned, using the aligned
- // intrinsic make the code slightly slower.
- if (0 == pass) {
- const int16_t *in = &input[column_start];
- // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- const int16_t *ina = in + 0 * str1;
- const int16_t *inb = in + 31 * str1;
- __m128i *step1a = &step1[0];
- __m128i *step1b = &step1[31];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 4 * str1;
- const int16_t *inb = in + 27 * str1;
- __m128i *step1a = &step1[4];
- __m128i *step1b = &step1[27];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 8 * str1;
- const int16_t *inb = in + 23 * str1;
- __m128i *step1a = &step1[8];
- __m128i *step1b = &step1[23];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 12 * str1;
- const int16_t *inb = in + 19 * str1;
- __m128i *step1a = &step1[12];
- __m128i *step1b = &step1[19];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- } else {
- int16_t *in = &intermediate[column_start];
- // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
- // Note: using the same approach as above to have common offset is
- // counter-productive as all offsets can be calculated at compile
- // time.
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
- __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
- __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
- __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
- __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
- __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
- __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
- __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
- step1[0] = ADD_EPI16(in00, in31);
- step1[1] = ADD_EPI16(in01, in30);
- step1[2] = ADD_EPI16(in02, in29);
- step1[3] = ADD_EPI16(in03, in28);
- step1[28] = SUB_EPI16(in03, in28);
- step1[29] = SUB_EPI16(in02, in29);
- step1[30] = SUB_EPI16(in01, in30);
- step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
- &step1[3], &step1[28], &step1[29],
- &step1[30], &step1[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
- __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
- __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
- __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
- __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
- __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
- __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
- __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
- step1[4] = ADD_EPI16(in04, in27);
- step1[5] = ADD_EPI16(in05, in26);
- step1[6] = ADD_EPI16(in06, in25);
- step1[7] = ADD_EPI16(in07, in24);
- step1[24] = SUB_EPI16(in07, in24);
- step1[25] = SUB_EPI16(in06, in25);
- step1[26] = SUB_EPI16(in05, in26);
- step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
- &step1[7], &step1[24], &step1[25],
- &step1[26], &step1[27]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
- __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
- __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
- __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
- __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
- __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
- __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
- __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
- step1[8] = ADD_EPI16(in08, in23);
- step1[9] = ADD_EPI16(in09, in22);
- step1[10] = ADD_EPI16(in10, in21);
- step1[11] = ADD_EPI16(in11, in20);
- step1[20] = SUB_EPI16(in11, in20);
- step1[21] = SUB_EPI16(in10, in21);
- step1[22] = SUB_EPI16(in09, in22);
- step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
- &step1[11], &step1[20], &step1[21],
- &step1[22], &step1[23]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
- __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
- __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
- __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
- __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
- __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
- __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
- __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
- step1[12] = ADD_EPI16(in12, in19);
- step1[13] = ADD_EPI16(in13, in18);
- step1[14] = ADD_EPI16(in14, in17);
- step1[15] = ADD_EPI16(in15, in16);
- step1[16] = SUB_EPI16(in15, in16);
- step1[17] = SUB_EPI16(in14, in17);
- step1[18] = SUB_EPI16(in13, in18);
- step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
- &step1[15], &step1[16], &step1[17],
- &step1[18], &step1[19]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Stage 2
- {
- step2[0] = ADD_EPI16(step1[0], step1[15]);
- step2[1] = ADD_EPI16(step1[1], step1[14]);
- step2[2] = ADD_EPI16(step1[2], step1[13]);
- step2[3] = ADD_EPI16(step1[3], step1[12]);
- step2[4] = ADD_EPI16(step1[4], step1[11]);
- step2[5] = ADD_EPI16(step1[5], step1[10]);
- step2[6] = ADD_EPI16(step1[6], step1[9]);
- step2[7] = ADD_EPI16(step1[7], step1[8]);
- step2[8] = SUB_EPI16(step1[7], step1[8]);
- step2[9] = SUB_EPI16(step1[6], step1[9]);
- step2[10] = SUB_EPI16(step1[5], step1[10]);
- step2[11] = SUB_EPI16(step1[4], step1[11]);
- step2[12] = SUB_EPI16(step1[3], step1[12]);
- step2[13] = SUB_EPI16(step1[2], step1[13]);
- step2[14] = SUB_EPI16(step1[1], step1[14]);
- step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
- &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
- const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
- const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
- const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
- const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
- const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
- const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
- const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
- const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
- const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
- const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
- const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
- const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
- const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
- const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
- const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
- const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
- const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
- const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
- const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
- const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
- const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
- const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
- const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
- const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
- const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
- const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
- const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
- const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
- const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
- const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
- const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
- const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
- const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
- const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
- const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
- const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
- const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
- const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
- // Combine
- step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
- step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
- step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
- step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
- step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
- step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
- step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
- step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
- &step2[23], &step2[24], &step2[25],
- &step2[26], &step2[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-
-#if !FDCT32x32_HIGH_PRECISION
- // dump the magnitude by half, hence the intermediate values are within
- // the range of 16 bits.
- if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
- __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
- __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
- __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
- __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
- __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
- __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
- __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
- __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
- __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
- __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
- __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
- __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
- __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
- __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
- __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
- __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
- __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
- __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
- __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
- __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
- __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
- __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
- step2[0] = SUB_EPI16(step2[0], s3_00_0);
- step2[1] = SUB_EPI16(step2[1], s3_01_0);
- step2[2] = SUB_EPI16(step2[2], s3_02_0);
- step2[3] = SUB_EPI16(step2[3], s3_03_0);
- step2[4] = SUB_EPI16(step2[4], s3_04_0);
- step2[5] = SUB_EPI16(step2[5], s3_05_0);
- step2[6] = SUB_EPI16(step2[6], s3_06_0);
- step2[7] = SUB_EPI16(step2[7], s3_07_0);
- step2[8] = SUB_EPI16(step2[8], s2_08_0);
- step2[9] = SUB_EPI16(step2[9], s2_09_0);
- step2[10] = SUB_EPI16(step2[10], s3_10_0);
- step2[11] = SUB_EPI16(step2[11], s3_11_0);
- step2[12] = SUB_EPI16(step2[12], s3_12_0);
- step2[13] = SUB_EPI16(step2[13], s3_13_0);
- step2[14] = SUB_EPI16(step2[14], s2_14_0);
- step2[15] = SUB_EPI16(step2[15], s2_15_0);
- step1[16] = SUB_EPI16(step1[16], s3_16_0);
- step1[17] = SUB_EPI16(step1[17], s3_17_0);
- step1[18] = SUB_EPI16(step1[18], s3_18_0);
- step1[19] = SUB_EPI16(step1[19], s3_19_0);
- step2[20] = SUB_EPI16(step2[20], s3_20_0);
- step2[21] = SUB_EPI16(step2[21], s3_21_0);
- step2[22] = SUB_EPI16(step2[22], s3_22_0);
- step2[23] = SUB_EPI16(step2[23], s3_23_0);
- step2[24] = SUB_EPI16(step2[24], s3_24_0);
- step2[25] = SUB_EPI16(step2[25], s3_25_0);
- step2[26] = SUB_EPI16(step2[26], s3_26_0);
- step2[27] = SUB_EPI16(step2[27], s3_27_0);
- step1[28] = SUB_EPI16(step1[28], s3_28_0);
- step1[29] = SUB_EPI16(step1[29], s3_29_0);
- step1[30] = SUB_EPI16(step1[30], s3_30_0);
- step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x32(
- &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
- &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
- &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
- &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
- &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- step2[0] = _mm_add_epi16(step2[0], kOne);
- step2[1] = _mm_add_epi16(step2[1], kOne);
- step2[2] = _mm_add_epi16(step2[2], kOne);
- step2[3] = _mm_add_epi16(step2[3], kOne);
- step2[4] = _mm_add_epi16(step2[4], kOne);
- step2[5] = _mm_add_epi16(step2[5], kOne);
- step2[6] = _mm_add_epi16(step2[6], kOne);
- step2[7] = _mm_add_epi16(step2[7], kOne);
- step2[8] = _mm_add_epi16(step2[8], kOne);
- step2[9] = _mm_add_epi16(step2[9], kOne);
- step2[10] = _mm_add_epi16(step2[10], kOne);
- step2[11] = _mm_add_epi16(step2[11], kOne);
- step2[12] = _mm_add_epi16(step2[12], kOne);
- step2[13] = _mm_add_epi16(step2[13], kOne);
- step2[14] = _mm_add_epi16(step2[14], kOne);
- step2[15] = _mm_add_epi16(step2[15], kOne);
- step1[16] = _mm_add_epi16(step1[16], kOne);
- step1[17] = _mm_add_epi16(step1[17], kOne);
- step1[18] = _mm_add_epi16(step1[18], kOne);
- step1[19] = _mm_add_epi16(step1[19], kOne);
- step2[20] = _mm_add_epi16(step2[20], kOne);
- step2[21] = _mm_add_epi16(step2[21], kOne);
- step2[22] = _mm_add_epi16(step2[22], kOne);
- step2[23] = _mm_add_epi16(step2[23], kOne);
- step2[24] = _mm_add_epi16(step2[24], kOne);
- step2[25] = _mm_add_epi16(step2[25], kOne);
- step2[26] = _mm_add_epi16(step2[26], kOne);
- step2[27] = _mm_add_epi16(step2[27], kOne);
- step1[28] = _mm_add_epi16(step1[28], kOne);
- step1[29] = _mm_add_epi16(step1[29], kOne);
- step1[30] = _mm_add_epi16(step1[30], kOne);
- step1[31] = _mm_add_epi16(step1[31], kOne);
-
- step2[0] = _mm_srai_epi16(step2[0], 2);
- step2[1] = _mm_srai_epi16(step2[1], 2);
- step2[2] = _mm_srai_epi16(step2[2], 2);
- step2[3] = _mm_srai_epi16(step2[3], 2);
- step2[4] = _mm_srai_epi16(step2[4], 2);
- step2[5] = _mm_srai_epi16(step2[5], 2);
- step2[6] = _mm_srai_epi16(step2[6], 2);
- step2[7] = _mm_srai_epi16(step2[7], 2);
- step2[8] = _mm_srai_epi16(step2[8], 2);
- step2[9] = _mm_srai_epi16(step2[9], 2);
- step2[10] = _mm_srai_epi16(step2[10], 2);
- step2[11] = _mm_srai_epi16(step2[11], 2);
- step2[12] = _mm_srai_epi16(step2[12], 2);
- step2[13] = _mm_srai_epi16(step2[13], 2);
- step2[14] = _mm_srai_epi16(step2[14], 2);
- step2[15] = _mm_srai_epi16(step2[15], 2);
- step1[16] = _mm_srai_epi16(step1[16], 2);
- step1[17] = _mm_srai_epi16(step1[17], 2);
- step1[18] = _mm_srai_epi16(step1[18], 2);
- step1[19] = _mm_srai_epi16(step1[19], 2);
- step2[20] = _mm_srai_epi16(step2[20], 2);
- step2[21] = _mm_srai_epi16(step2[21], 2);
- step2[22] = _mm_srai_epi16(step2[22], 2);
- step2[23] = _mm_srai_epi16(step2[23], 2);
- step2[24] = _mm_srai_epi16(step2[24], 2);
- step2[25] = _mm_srai_epi16(step2[25], 2);
- step2[26] = _mm_srai_epi16(step2[26], 2);
- step2[27] = _mm_srai_epi16(step2[27], 2);
- step1[28] = _mm_srai_epi16(step1[28], 2);
- step1[29] = _mm_srai_epi16(step1[29], 2);
- step1[30] = _mm_srai_epi16(step1[30], 2);
- step1[31] = _mm_srai_epi16(step1[31], 2);
- }
-#endif // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
- if (pass == 0) {
-#endif
- // Stage 3
- {
- step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
- step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
- step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
- step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
- step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
- step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
- step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
- step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
- &step3[3], &step3[4], &step3[5],
- &step3[6], &step3[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- // Combine
- step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
- step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
- step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
- step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
- &step3[13]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[16] = ADD_EPI16(step2[23], step1[16]);
- step3[17] = ADD_EPI16(step2[22], step1[17]);
- step3[18] = ADD_EPI16(step2[21], step1[18]);
- step3[19] = ADD_EPI16(step2[20], step1[19]);
- step3[20] = SUB_EPI16(step1[19], step2[20]);
- step3[21] = SUB_EPI16(step1[18], step2[21]);
- step3[22] = SUB_EPI16(step1[17], step2[22]);
- step3[23] = SUB_EPI16(step1[16], step2[23]);
- step3[24] = SUB_EPI16(step1[31], step2[24]);
- step3[25] = SUB_EPI16(step1[30], step2[25]);
- step3[26] = SUB_EPI16(step1[29], step2[26]);
- step3[27] = SUB_EPI16(step1[28], step2[27]);
- step3[28] = ADD_EPI16(step2[27], step1[28]);
- step3[29] = ADD_EPI16(step2[26], step1[29]);
- step3[30] = ADD_EPI16(step2[25], step1[30]);
- step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
- &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
- &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
- &step3[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-
- // Stage 4
- {
- step1[0] = ADD_EPI16(step3[3], step3[0]);
- step1[1] = ADD_EPI16(step3[2], step3[1]);
- step1[2] = SUB_EPI16(step3[1], step3[2]);
- step1[3] = SUB_EPI16(step3[0], step3[3]);
- step1[8] = ADD_EPI16(step3[11], step2[8]);
- step1[9] = ADD_EPI16(step3[10], step2[9]);
- step1[10] = SUB_EPI16(step2[9], step3[10]);
- step1[11] = SUB_EPI16(step2[8], step3[11]);
- step1[12] = SUB_EPI16(step2[15], step3[12]);
- step1[13] = SUB_EPI16(step2[14], step3[13]);
- step1[14] = ADD_EPI16(step3[13], step2[14]);
- step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
- &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
- &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
- // Combine
- step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
- step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
- // Combine
- step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
- step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
- step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
- step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
- step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
- step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
- step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
- step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
- &step1[21], &step1[26], &step1[27],
- &step1[28], &step1[29]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 5
- {
- step2[4] = ADD_EPI16(step1[5], step3[4]);
- step2[5] = SUB_EPI16(step3[4], step1[5]);
- step2[6] = SUB_EPI16(step3[7], step1[6]);
- step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
- &step2[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
- const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
- const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
- const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
- const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
- const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
- const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
- const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
- const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
- const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
- const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
- const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i out_00_4 =
- _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
- const __m128i out_00_5 =
- _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
- const __m128i out_16_4 =
- _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
- const __m128i out_16_5 =
- _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
- const __m128i out_08_4 =
- _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
- const __m128i out_08_5 =
- _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
- const __m128i out_24_4 =
- _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
- const __m128i out_24_5 =
- _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
- const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
- const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
- const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
- const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
- const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
- const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
- const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
- const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
- // Combine
- out[0] = _mm_packs_epi32(out_00_6, out_00_7);
- out[16] = _mm_packs_epi32(out_16_6, out_16_7);
- out[8] = _mm_packs_epi32(out_08_6, out_08_7);
- out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
- const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
- const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
- const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
- const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
- const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
- const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
- const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
- const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
- const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
- const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
- const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
- const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
- const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
- const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
- const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
- const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
- const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
- const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
- // Combine
- step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
- step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
- step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
- step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
- &step2[14]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step2[16] = ADD_EPI16(step1[19], step3[16]);
- step2[17] = ADD_EPI16(step1[18], step3[17]);
- step2[18] = SUB_EPI16(step3[17], step1[18]);
- step2[19] = SUB_EPI16(step3[16], step1[19]);
- step2[20] = SUB_EPI16(step3[23], step1[20]);
- step2[21] = SUB_EPI16(step3[22], step1[21]);
- step2[22] = ADD_EPI16(step1[21], step3[22]);
- step2[23] = ADD_EPI16(step1[20], step3[23]);
- step2[24] = ADD_EPI16(step1[27], step3[24]);
- step2[25] = ADD_EPI16(step1[26], step3[25]);
- step2[26] = SUB_EPI16(step3[25], step1[26]);
- step2[27] = SUB_EPI16(step3[24], step1[27]);
- step2[28] = SUB_EPI16(step3[31], step1[28]);
- step2[29] = SUB_EPI16(step3[30], step1[29]);
- step2[30] = ADD_EPI16(step1[29], step3[30]);
- step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
- &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
- &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
- &step2[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 6
- {
- const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
- const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
- const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
- const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
- const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
- const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
- const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
- const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
- // dct_const_round_shift
- const __m128i out_04_4 =
- _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
- const __m128i out_04_5 =
- _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
- const __m128i out_20_4 =
- _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
- const __m128i out_20_5 =
- _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
- const __m128i out_12_4 =
- _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
- const __m128i out_12_5 =
- _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
- const __m128i out_28_4 =
- _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
- const __m128i out_28_5 =
- _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
- const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
- const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
- const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
- const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
- const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
- const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
- const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
- const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
- // Combine
- out[4] = _mm_packs_epi32(out_04_6, out_04_7);
- out[20] = _mm_packs_epi32(out_20_6, out_20_7);
- out[12] = _mm_packs_epi32(out_12_6, out_12_7);
- out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[8] = ADD_EPI16(step2[9], step1[8]);
- step3[9] = SUB_EPI16(step1[8], step2[9]);
- step3[10] = SUB_EPI16(step1[11], step2[10]);
- step3[11] = ADD_EPI16(step2[10], step1[11]);
- step3[12] = ADD_EPI16(step2[13], step1[12]);
- step3[13] = SUB_EPI16(step1[12], step2[13]);
- step3[14] = SUB_EPI16(step1[15], step2[14]);
- step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
- &step3[11], &step3[12], &step3[13],
- &step3[14], &step3[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
- const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
- const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
- const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
- const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
- const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
- const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
- const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
- const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
- const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
- const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
- const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
- const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
- const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
- const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
- const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
- const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
- const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
- const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
- const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
- const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
- const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
- const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
- const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
- // dct_const_round_shift
- const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
- const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
- const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
- const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
- const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
- const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
- const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
- const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
- const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
- const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
- const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
- const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
- const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
- const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
- const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
- const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
- // Combine
- step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
- step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
- step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
- step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
- // Combine
- step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
- step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
- step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
- step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
- &step3[22], &step3[25], &step3[26],
- &step3[29], &step3[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 7
- {
- const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
- const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
- const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
- const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
- const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
- const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
- const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
- const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
- const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
- const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
- const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
- const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
- const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
- const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
- const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
- const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
- const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
- const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
- const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
- const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
- const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
- const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
- const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
- const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
- // dct_const_round_shift
- const __m128i out_02_4 =
- _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
- const __m128i out_02_5 =
- _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
- const __m128i out_18_4 =
- _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
- const __m128i out_18_5 =
- _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
- const __m128i out_10_4 =
- _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
- const __m128i out_10_5 =
- _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
- const __m128i out_26_4 =
- _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
- const __m128i out_26_5 =
- _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
- const __m128i out_06_4 =
- _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
- const __m128i out_06_5 =
- _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
- const __m128i out_22_4 =
- _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
- const __m128i out_22_5 =
- _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
- const __m128i out_14_4 =
- _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
- const __m128i out_14_5 =
- _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
- const __m128i out_30_4 =
- _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
- const __m128i out_30_5 =
- _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
- const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
- const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
- const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
- const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
- const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
- const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
- const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
- const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
- const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
- const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
- const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
- const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
- const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
- const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
- const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
- const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
- // Combine
- out[2] = _mm_packs_epi32(out_02_6, out_02_7);
- out[18] = _mm_packs_epi32(out_18_6, out_18_7);
- out[10] = _mm_packs_epi32(out_10_6, out_10_7);
- out[26] = _mm_packs_epi32(out_26_6, out_26_7);
- out[6] = _mm_packs_epi32(out_06_6, out_06_7);
- out[22] = _mm_packs_epi32(out_22_6, out_22_7);
- out[14] = _mm_packs_epi32(out_14_6, out_14_7);
- out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
- &out[6], &out[22], &out[14], &out[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step1[16] = ADD_EPI16(step3[17], step2[16]);
- step1[17] = SUB_EPI16(step2[16], step3[17]);
- step1[18] = SUB_EPI16(step2[19], step3[18]);
- step1[19] = ADD_EPI16(step3[18], step2[19]);
- step1[20] = ADD_EPI16(step3[21], step2[20]);
- step1[21] = SUB_EPI16(step2[20], step3[21]);
- step1[22] = SUB_EPI16(step2[23], step3[22]);
- step1[23] = ADD_EPI16(step3[22], step2[23]);
- step1[24] = ADD_EPI16(step3[25], step2[24]);
- step1[25] = SUB_EPI16(step2[24], step3[25]);
- step1[26] = SUB_EPI16(step2[27], step3[26]);
- step1[27] = ADD_EPI16(step3[26], step2[27]);
- step1[28] = ADD_EPI16(step3[29], step2[28]);
- step1[29] = SUB_EPI16(step2[28], step3[29]);
- step1[30] = SUB_EPI16(step2[31], step3[30]);
- step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
- &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
- &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
- &step1[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Final stage --- outputs indices are bit-reversed.
- {
- const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
- const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
- const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
- const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
- const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
- const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
- const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
- const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
- const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
- const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
- const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
- const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
- const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
- const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
- const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
- const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
- const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
- const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
- const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
- const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
- const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
- const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
- const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
- const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
- // dct_const_round_shift
- const __m128i out_01_4 =
- _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
- const __m128i out_01_5 =
- _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
- const __m128i out_17_4 =
- _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
- const __m128i out_17_5 =
- _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
- const __m128i out_09_4 =
- _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
- const __m128i out_09_5 =
- _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
- const __m128i out_25_4 =
- _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
- const __m128i out_25_5 =
- _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
- const __m128i out_07_4 =
- _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
- const __m128i out_07_5 =
- _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
- const __m128i out_23_4 =
- _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
- const __m128i out_23_5 =
- _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
- const __m128i out_15_4 =
- _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
- const __m128i out_15_5 =
- _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
- const __m128i out_31_4 =
- _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
- const __m128i out_31_5 =
- _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
- const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
- const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
- const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
- const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
- const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
- const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
- const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
- const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
- const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
- const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
- const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
- const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
- const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
- const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
- const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
- const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
- // Combine
- out[1] = _mm_packs_epi32(out_01_6, out_01_7);
- out[17] = _mm_packs_epi32(out_17_6, out_17_7);
- out[9] = _mm_packs_epi32(out_09_6, out_09_7);
- out[25] = _mm_packs_epi32(out_25_6, out_25_7);
- out[7] = _mm_packs_epi32(out_07_6, out_07_7);
- out[23] = _mm_packs_epi32(out_23_6, out_23_7);
- out[15] = _mm_packs_epi32(out_15_6, out_15_7);
- out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
- &out[7], &out[23], &out[15], &out[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
- const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
- const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
- const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
- const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
- const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
- const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
- const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
- const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
- const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
- const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
- const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
- const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
- const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
- const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
- const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
- const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
- const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
- const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
- const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
- const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
- const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
- const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
- const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
- // dct_const_round_shift
- const __m128i out_05_4 =
- _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
- const __m128i out_05_5 =
- _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
- const __m128i out_21_4 =
- _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
- const __m128i out_21_5 =
- _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
- const __m128i out_13_4 =
- _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
- const __m128i out_13_5 =
- _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
- const __m128i out_29_4 =
- _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
- const __m128i out_29_5 =
- _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
- const __m128i out_03_4 =
- _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
- const __m128i out_03_5 =
- _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
- const __m128i out_19_4 =
- _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
- const __m128i out_19_5 =
- _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
- const __m128i out_11_4 =
- _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
- const __m128i out_11_5 =
- _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
- const __m128i out_27_4 =
- _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
- const __m128i out_27_5 =
- _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
- const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
- const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
- const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
- const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
- const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
- const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
- const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
- const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
- const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
- const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
- const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
- const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
- const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
- const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
- const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
- const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
- // Combine
- out[5] = _mm_packs_epi32(out_05_6, out_05_7);
- out[21] = _mm_packs_epi32(out_21_6, out_21_7);
- out[13] = _mm_packs_epi32(out_13_6, out_13_7);
- out[29] = _mm_packs_epi32(out_29_6, out_29_7);
- out[3] = _mm_packs_epi32(out_03_6, out_03_7);
- out[19] = _mm_packs_epi32(out_19_6, out_19_7);
- out[11] = _mm_packs_epi32(out_11_6, out_11_7);
- out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
- &out[3], &out[19], &out[11], &out[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-#if FDCT32x32_HIGH_PRECISION
- } else {
- __m128i lstep1[64], lstep2[64], lstep3[64];
- __m128i u[32], v[32], sign[16];
- const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
- // start using 32-bit operations
- // stage 3
- {
- // expanding to 32-bit length priori to addition operations
- lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
- lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
- lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
- lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
- lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
- lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
- lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
- lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
- lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
- lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
- lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
- lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
- lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
- lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
- lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
- lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
- lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
- lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
- lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
- lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
- lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
- lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
- lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
- lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
- lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
- lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
- lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
- lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
- lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
- lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
- lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
- lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
- lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
- lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
- lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
- lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
- lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
- lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
- lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
- lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
- lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
- lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
- lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
- lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
- lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
- lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
- lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
- lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- }
- {
- lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
- lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
- lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
- lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
- lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
- lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
- lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
- lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
- lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
- lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
- lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
- lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
- lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
- lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
- lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
- lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
- lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
- lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
- lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
- lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
- lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
- lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
- lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
- lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
- lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
- lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
- lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
- lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
- lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
- lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
- lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
- lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
- lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
- lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
- lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
- lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
- lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
- lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
- lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
- lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
- lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
- lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
- lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
- lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
- lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
- lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
- lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
- lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
- lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
- lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
- lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
- lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
- lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
- lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
- lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
- lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
- lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
- lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
- lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
- lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
- lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
- lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
- lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
- lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
- lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
- lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
- lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
- lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
- lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
- lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
- lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
- lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
- lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
- lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
- lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
- lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
- lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
- lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
- lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
- lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
- lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
- lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
- lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
- lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
- lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
- lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
- lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
- lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
- lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
- lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
- lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
- lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
- lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
- lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
- lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
- lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
- }
-
- // stage 4
- {
- // expanding to 32-bit length priori to addition operations
- lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
- lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
- lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
- lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
- lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
- lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
- lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
- lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
- lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
- lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
- lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
- lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
- lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
- lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
- lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
- lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
- lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
- lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
- lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
- lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
- lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
- lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
- lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
- lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
- lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
- lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
- lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
- lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
- lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
- lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
- lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
- lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
- lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
- lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
- lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
- lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
- lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
- lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
- lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
- lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
- }
- {
- // to be continued...
- //
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[0] = k_madd_epi32(u[0], k32_p16_m16);
- v[1] = k_madd_epi32(u[1], k32_p16_m16);
- v[2] = k_madd_epi32(u[2], k32_p16_m16);
- v[3] = k_madd_epi32(u[3], k32_p16_m16);
- v[4] = k_madd_epi32(u[0], k32_p16_p16);
- v[5] = k_madd_epi32(u[1], k32_p16_p16);
- v[6] = k_madd_epi32(u[2], k32_p16_p16);
- v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
- &v[5], &v[6], &v[7], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- }
- {
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
- const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
- u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
- u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
- u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
- u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
- u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
- u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
- u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
- u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
- u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
- u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
- u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
- u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
- u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
- u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
- u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
- v[0] = k_madd_epi32(u[0], k32_m08_p24);
- v[1] = k_madd_epi32(u[1], k32_m08_p24);
- v[2] = k_madd_epi32(u[2], k32_m08_p24);
- v[3] = k_madd_epi32(u[3], k32_m08_p24);
- v[4] = k_madd_epi32(u[4], k32_m08_p24);
- v[5] = k_madd_epi32(u[5], k32_m08_p24);
- v[6] = k_madd_epi32(u[6], k32_m08_p24);
- v[7] = k_madd_epi32(u[7], k32_m08_p24);
- v[8] = k_madd_epi32(u[8], k32_m24_m08);
- v[9] = k_madd_epi32(u[9], k32_m24_m08);
- v[10] = k_madd_epi32(u[10], k32_m24_m08);
- v[11] = k_madd_epi32(u[11], k32_m24_m08);
- v[12] = k_madd_epi32(u[12], k32_m24_m08);
- v[13] = k_madd_epi32(u[13], k32_m24_m08);
- v[14] = k_madd_epi32(u[14], k32_m24_m08);
- v[15] = k_madd_epi32(u[15], k32_m24_m08);
- v[16] = k_madd_epi32(u[12], k32_m08_p24);
- v[17] = k_madd_epi32(u[13], k32_m08_p24);
- v[18] = k_madd_epi32(u[14], k32_m08_p24);
- v[19] = k_madd_epi32(u[15], k32_m08_p24);
- v[20] = k_madd_epi32(u[8], k32_m08_p24);
- v[21] = k_madd_epi32(u[9], k32_m08_p24);
- v[22] = k_madd_epi32(u[10], k32_m08_p24);
- v[23] = k_madd_epi32(u[11], k32_m08_p24);
- v[24] = k_madd_epi32(u[4], k32_p24_p08);
- v[25] = k_madd_epi32(u[5], k32_p24_p08);
- v[26] = k_madd_epi32(u[6], k32_p24_p08);
- v[27] = k_madd_epi32(u[7], k32_p24_p08);
- v[28] = k_madd_epi32(u[0], k32_p24_p08);
- v[29] = k_madd_epi32(u[1], k32_p24_p08);
- v[30] = k_madd_epi32(u[2], k32_p24_p08);
- v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- }
- // stage 5
- {
- lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
- lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
- lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
- lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
- lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
- lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
- lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
- lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
- }
- {
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
- u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
- u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
- u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
- u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
- u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
- u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
- u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[0] = k_madd_epi32(u[0], k32_p16_p16);
- v[1] = k_madd_epi32(u[1], k32_p16_p16);
- v[2] = k_madd_epi32(u[2], k32_p16_p16);
- v[3] = k_madd_epi32(u[3], k32_p16_p16);
- v[4] = k_madd_epi32(u[0], k32_p16_m16);
- v[5] = k_madd_epi32(u[1], k32_p16_m16);
- v[6] = k_madd_epi32(u[2], k32_p16_m16);
- v[7] = k_madd_epi32(u[3], k32_p16_m16);
- v[8] = k_madd_epi32(u[4], k32_p24_p08);
- v[9] = k_madd_epi32(u[5], k32_p24_p08);
- v[10] = k_madd_epi32(u[6], k32_p24_p08);
- v[11] = k_madd_epi32(u[7], k32_p24_p08);
- v[12] = k_madd_epi32(u[4], k32_m08_p24);
- v[13] = k_madd_epi32(u[5], k32_m08_p24);
- v[14] = k_madd_epi32(u[6], k32_m08_p24);
- v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- sign[0] = _mm_cmplt_epi32(u[0], kZero);
- sign[1] = _mm_cmplt_epi32(u[1], kZero);
- sign[2] = _mm_cmplt_epi32(u[2], kZero);
- sign[3] = _mm_cmplt_epi32(u[3], kZero);
- sign[4] = _mm_cmplt_epi32(u[4], kZero);
- sign[5] = _mm_cmplt_epi32(u[5], kZero);
- sign[6] = _mm_cmplt_epi32(u[6], kZero);
- sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
- u[0] = _mm_sub_epi32(u[0], sign[0]);
- u[1] = _mm_sub_epi32(u[1], sign[1]);
- u[2] = _mm_sub_epi32(u[2], sign[2]);
- u[3] = _mm_sub_epi32(u[3], sign[3]);
- u[4] = _mm_sub_epi32(u[4], sign[4]);
- u[5] = _mm_sub_epi32(u[5], sign[5]);
- u[6] = _mm_sub_epi32(u[6], sign[6]);
- u[7] = _mm_sub_epi32(u[7], sign[7]);
-
- u[0] = _mm_add_epi32(u[0], K32One);
- u[1] = _mm_add_epi32(u[1], K32One);
- u[2] = _mm_add_epi32(u[2], K32One);
- u[3] = _mm_add_epi32(u[3], K32One);
- u[4] = _mm_add_epi32(u[4], K32One);
- u[5] = _mm_add_epi32(u[5], K32One);
- u[6] = _mm_add_epi32(u[6], K32One);
- u[7] = _mm_add_epi32(u[7], K32One);
-
- u[0] = _mm_srai_epi32(u[0], 2);
- u[1] = _mm_srai_epi32(u[1], 2);
- u[2] = _mm_srai_epi32(u[2], 2);
- u[3] = _mm_srai_epi32(u[3], 2);
- u[4] = _mm_srai_epi32(u[4], 2);
- u[5] = _mm_srai_epi32(u[5], 2);
- u[6] = _mm_srai_epi32(u[6], 2);
- u[7] = _mm_srai_epi32(u[7], 2);
-
- // Combine
- out[0] = _mm_packs_epi32(u[0], u[1]);
- out[16] = _mm_packs_epi32(u[2], u[3]);
- out[8] = _mm_packs_epi32(u[4], u[5]);
- out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
- const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
- u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
- u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
- u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
- u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
- u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
- u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
- u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
- v[0] = k_madd_epi32(u[0], k32_m08_p24);
- v[1] = k_madd_epi32(u[1], k32_m08_p24);
- v[2] = k_madd_epi32(u[2], k32_m08_p24);
- v[3] = k_madd_epi32(u[3], k32_m08_p24);
- v[4] = k_madd_epi32(u[4], k32_m24_m08);
- v[5] = k_madd_epi32(u[5], k32_m24_m08);
- v[6] = k_madd_epi32(u[6], k32_m24_m08);
- v[7] = k_madd_epi32(u[7], k32_m24_m08);
- v[8] = k_madd_epi32(u[4], k32_m08_p24);
- v[9] = k_madd_epi32(u[5], k32_m08_p24);
- v[10] = k_madd_epi32(u[6], k32_m08_p24);
- v[11] = k_madd_epi32(u[7], k32_m08_p24);
- v[12] = k_madd_epi32(u[0], k32_p24_p08);
- v[13] = k_madd_epi32(u[1], k32_p24_p08);
- v[14] = k_madd_epi32(u[2], k32_p24_p08);
- v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- }
- {
- lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
- lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
- lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
- lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
- lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
- lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
- lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
- lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
- lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
- lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
- lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
- lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
- lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
- lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
- lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
- lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
- lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
- lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
- lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
- lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
- lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
- lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
- lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
- lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
- lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
- lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
- lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
- lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
- lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
- lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
- lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
- lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
- }
- // stage 6
- {
- const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
- const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
- const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
- u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
- u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
- u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
- u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
- u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
- u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
- u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
- u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
- u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
- u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
- u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
- u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
- u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
- u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
- v[0] = k_madd_epi32(u[0], k32_p28_p04);
- v[1] = k_madd_epi32(u[1], k32_p28_p04);
- v[2] = k_madd_epi32(u[2], k32_p28_p04);
- v[3] = k_madd_epi32(u[3], k32_p28_p04);
- v[4] = k_madd_epi32(u[4], k32_p12_p20);
- v[5] = k_madd_epi32(u[5], k32_p12_p20);
- v[6] = k_madd_epi32(u[6], k32_p12_p20);
- v[7] = k_madd_epi32(u[7], k32_p12_p20);
- v[8] = k_madd_epi32(u[8], k32_m20_p12);
- v[9] = k_madd_epi32(u[9], k32_m20_p12);
- v[10] = k_madd_epi32(u[10], k32_m20_p12);
- v[11] = k_madd_epi32(u[11], k32_m20_p12);
- v[12] = k_madd_epi32(u[12], k32_m04_p28);
- v[13] = k_madd_epi32(u[13], k32_m04_p28);
- v[14] = k_madd_epi32(u[14], k32_m04_p28);
- v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- sign[0] = _mm_cmplt_epi32(u[0], kZero);
- sign[1] = _mm_cmplt_epi32(u[1], kZero);
- sign[2] = _mm_cmplt_epi32(u[2], kZero);
- sign[3] = _mm_cmplt_epi32(u[3], kZero);
- sign[4] = _mm_cmplt_epi32(u[4], kZero);
- sign[5] = _mm_cmplt_epi32(u[5], kZero);
- sign[6] = _mm_cmplt_epi32(u[6], kZero);
- sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
- u[0] = _mm_sub_epi32(u[0], sign[0]);
- u[1] = _mm_sub_epi32(u[1], sign[1]);
- u[2] = _mm_sub_epi32(u[2], sign[2]);
- u[3] = _mm_sub_epi32(u[3], sign[3]);
- u[4] = _mm_sub_epi32(u[4], sign[4]);
- u[5] = _mm_sub_epi32(u[5], sign[5]);
- u[6] = _mm_sub_epi32(u[6], sign[6]);
- u[7] = _mm_sub_epi32(u[7], sign[7]);
-
- u[0] = _mm_add_epi32(u[0], K32One);
- u[1] = _mm_add_epi32(u[1], K32One);
- u[2] = _mm_add_epi32(u[2], K32One);
- u[3] = _mm_add_epi32(u[3], K32One);
- u[4] = _mm_add_epi32(u[4], K32One);
- u[5] = _mm_add_epi32(u[5], K32One);
- u[6] = _mm_add_epi32(u[6], K32One);
- u[7] = _mm_add_epi32(u[7], K32One);
-
- u[0] = _mm_srai_epi32(u[0], 2);
- u[1] = _mm_srai_epi32(u[1], 2);
- u[2] = _mm_srai_epi32(u[2], 2);
- u[3] = _mm_srai_epi32(u[3], 2);
- u[4] = _mm_srai_epi32(u[4], 2);
- u[5] = _mm_srai_epi32(u[5], 2);
- u[6] = _mm_srai_epi32(u[6], 2);
- u[7] = _mm_srai_epi32(u[7], 2);
-
- out[4] = _mm_packs_epi32(u[0], u[1]);
- out[20] = _mm_packs_epi32(u[2], u[3]);
- out[12] = _mm_packs_epi32(u[4], u[5]);
- out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
- lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
- lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
- lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
- lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
- lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
- lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
- lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
- lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
- lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
- lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
- lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
- lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
- lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
- lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
- lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
- }
- {
- const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
- const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
- const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m12_m20 =
- pair_set_epi32(-cospi_12_64, -cospi_20_64);
- const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
- const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
- u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
- u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
- u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
- u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
- u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
- u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
- u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
- u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
- u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
- u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
- u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
- u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
- u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
- u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
- u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
- u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
- v[0] = k_madd_epi32(u[0], k32_m04_p28);
- v[1] = k_madd_epi32(u[1], k32_m04_p28);
- v[2] = k_madd_epi32(u[2], k32_m04_p28);
- v[3] = k_madd_epi32(u[3], k32_m04_p28);
- v[4] = k_madd_epi32(u[4], k32_m28_m04);
- v[5] = k_madd_epi32(u[5], k32_m28_m04);
- v[6] = k_madd_epi32(u[6], k32_m28_m04);
- v[7] = k_madd_epi32(u[7], k32_m28_m04);
- v[8] = k_madd_epi32(u[8], k32_m20_p12);
- v[9] = k_madd_epi32(u[9], k32_m20_p12);
- v[10] = k_madd_epi32(u[10], k32_m20_p12);
- v[11] = k_madd_epi32(u[11], k32_m20_p12);
- v[12] = k_madd_epi32(u[12], k32_m12_m20);
- v[13] = k_madd_epi32(u[13], k32_m12_m20);
- v[14] = k_madd_epi32(u[14], k32_m12_m20);
- v[15] = k_madd_epi32(u[15], k32_m12_m20);
- v[16] = k_madd_epi32(u[12], k32_m20_p12);
- v[17] = k_madd_epi32(u[13], k32_m20_p12);
- v[18] = k_madd_epi32(u[14], k32_m20_p12);
- v[19] = k_madd_epi32(u[15], k32_m20_p12);
- v[20] = k_madd_epi32(u[8], k32_p12_p20);
- v[21] = k_madd_epi32(u[9], k32_p12_p20);
- v[22] = k_madd_epi32(u[10], k32_p12_p20);
- v[23] = k_madd_epi32(u[11], k32_p12_p20);
- v[24] = k_madd_epi32(u[4], k32_m04_p28);
- v[25] = k_madd_epi32(u[5], k32_m04_p28);
- v[26] = k_madd_epi32(u[6], k32_m04_p28);
- v[27] = k_madd_epi32(u[7], k32_m04_p28);
- v[28] = k_madd_epi32(u[0], k32_p28_p04);
- v[29] = k_madd_epi32(u[1], k32_p28_p04);
- v[30] = k_madd_epi32(u[2], k32_p28_p04);
- v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- }
- // stage 7
- {
- const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
- const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
- const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
- const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
- const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
- const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
- const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
- const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
- u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
- u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
- u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
- u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
- u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
- u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
- u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
- u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
- u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
- u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
- u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
- u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
- u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
- u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
- u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
- v[0] = k_madd_epi32(u[0], k32_p30_p02);
- v[1] = k_madd_epi32(u[1], k32_p30_p02);
- v[2] = k_madd_epi32(u[2], k32_p30_p02);
- v[3] = k_madd_epi32(u[3], k32_p30_p02);
- v[4] = k_madd_epi32(u[4], k32_p14_p18);
- v[5] = k_madd_epi32(u[5], k32_p14_p18);
- v[6] = k_madd_epi32(u[6], k32_p14_p18);
- v[7] = k_madd_epi32(u[7], k32_p14_p18);
- v[8] = k_madd_epi32(u[8], k32_p22_p10);
- v[9] = k_madd_epi32(u[9], k32_p22_p10);
- v[10] = k_madd_epi32(u[10], k32_p22_p10);
- v[11] = k_madd_epi32(u[11], k32_p22_p10);
- v[12] = k_madd_epi32(u[12], k32_p06_p26);
- v[13] = k_madd_epi32(u[13], k32_p06_p26);
- v[14] = k_madd_epi32(u[14], k32_p06_p26);
- v[15] = k_madd_epi32(u[15], k32_p06_p26);
- v[16] = k_madd_epi32(u[12], k32_m26_p06);
- v[17] = k_madd_epi32(u[13], k32_m26_p06);
- v[18] = k_madd_epi32(u[14], k32_m26_p06);
- v[19] = k_madd_epi32(u[15], k32_m26_p06);
- v[20] = k_madd_epi32(u[8], k32_m10_p22);
- v[21] = k_madd_epi32(u[9], k32_m10_p22);
- v[22] = k_madd_epi32(u[10], k32_m10_p22);
- v[23] = k_madd_epi32(u[11], k32_m10_p22);
- v[24] = k_madd_epi32(u[4], k32_m18_p14);
- v[25] = k_madd_epi32(u[5], k32_m18_p14);
- v[26] = k_madd_epi32(u[6], k32_m18_p14);
- v[27] = k_madd_epi32(u[7], k32_m18_p14);
- v[28] = k_madd_epi32(u[0], k32_m02_p30);
- v[29] = k_madd_epi32(u[1], k32_m02_p30);
- v[30] = k_madd_epi32(u[2], k32_m02_p30);
- v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[2] = _mm_packs_epi32(u[0], u[1]);
- out[18] = _mm_packs_epi32(u[2], u[3]);
- out[10] = _mm_packs_epi32(u[4], u[5]);
- out[26] = _mm_packs_epi32(u[6], u[7]);
- out[6] = _mm_packs_epi32(u[8], u[9]);
- out[22] = _mm_packs_epi32(u[10], u[11]);
- out[14] = _mm_packs_epi32(u[12], u[13]);
- out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
- &out[6], &out[22], &out[14], &out[30]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
- lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
- lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
- lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
- lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
- lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
- lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
- lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
- lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
- lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
- lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
- lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
- lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
- lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
- lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
- lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
- lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
- lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
- lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
- lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
- lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
- lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
- lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
- lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
- lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
- lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
- lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
- lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
- lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
- lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
- lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
- lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
- }
- // stage 8
- {
- const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
- const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
- const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
- const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
- const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
- const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
- const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
- const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
- u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
- u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
- u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
- u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
- u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
- u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
- u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
- u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
- u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
- u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
- u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
- u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
- u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
- u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
- u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
- v[0] = k_madd_epi32(u[0], k32_p31_p01);
- v[1] = k_madd_epi32(u[1], k32_p31_p01);
- v[2] = k_madd_epi32(u[2], k32_p31_p01);
- v[3] = k_madd_epi32(u[3], k32_p31_p01);
- v[4] = k_madd_epi32(u[4], k32_p15_p17);
- v[5] = k_madd_epi32(u[5], k32_p15_p17);
- v[6] = k_madd_epi32(u[6], k32_p15_p17);
- v[7] = k_madd_epi32(u[7], k32_p15_p17);
- v[8] = k_madd_epi32(u[8], k32_p23_p09);
- v[9] = k_madd_epi32(u[9], k32_p23_p09);
- v[10] = k_madd_epi32(u[10], k32_p23_p09);
- v[11] = k_madd_epi32(u[11], k32_p23_p09);
- v[12] = k_madd_epi32(u[12], k32_p07_p25);
- v[13] = k_madd_epi32(u[13], k32_p07_p25);
- v[14] = k_madd_epi32(u[14], k32_p07_p25);
- v[15] = k_madd_epi32(u[15], k32_p07_p25);
- v[16] = k_madd_epi32(u[12], k32_m25_p07);
- v[17] = k_madd_epi32(u[13], k32_m25_p07);
- v[18] = k_madd_epi32(u[14], k32_m25_p07);
- v[19] = k_madd_epi32(u[15], k32_m25_p07);
- v[20] = k_madd_epi32(u[8], k32_m09_p23);
- v[21] = k_madd_epi32(u[9], k32_m09_p23);
- v[22] = k_madd_epi32(u[10], k32_m09_p23);
- v[23] = k_madd_epi32(u[11], k32_m09_p23);
- v[24] = k_madd_epi32(u[4], k32_m17_p15);
- v[25] = k_madd_epi32(u[5], k32_m17_p15);
- v[26] = k_madd_epi32(u[6], k32_m17_p15);
- v[27] = k_madd_epi32(u[7], k32_m17_p15);
- v[28] = k_madd_epi32(u[0], k32_m01_p31);
- v[29] = k_madd_epi32(u[1], k32_m01_p31);
- v[30] = k_madd_epi32(u[2], k32_m01_p31);
- v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[1] = _mm_packs_epi32(u[0], u[1]);
- out[17] = _mm_packs_epi32(u[2], u[3]);
- out[9] = _mm_packs_epi32(u[4], u[5]);
- out[25] = _mm_packs_epi32(u[6], u[7]);
- out[7] = _mm_packs_epi32(u[8], u[9]);
- out[23] = _mm_packs_epi32(u[10], u[11]);
- out[15] = _mm_packs_epi32(u[12], u[13]);
- out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
- &out[7], &out[23], &out[15], &out[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
- const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
- const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
- const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
- const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
- const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
- const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
- const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
- u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
- u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
- u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
- u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
- u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
- u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
- u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
- u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
- u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
- u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
- u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
- u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
- u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
- u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
- u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
- v[0] = k_madd_epi32(u[0], k32_p27_p05);
- v[1] = k_madd_epi32(u[1], k32_p27_p05);
- v[2] = k_madd_epi32(u[2], k32_p27_p05);
- v[3] = k_madd_epi32(u[3], k32_p27_p05);
- v[4] = k_madd_epi32(u[4], k32_p11_p21);
- v[5] = k_madd_epi32(u[5], k32_p11_p21);
- v[6] = k_madd_epi32(u[6], k32_p11_p21);
- v[7] = k_madd_epi32(u[7], k32_p11_p21);
- v[8] = k_madd_epi32(u[8], k32_p19_p13);
- v[9] = k_madd_epi32(u[9], k32_p19_p13);
- v[10] = k_madd_epi32(u[10], k32_p19_p13);
- v[11] = k_madd_epi32(u[11], k32_p19_p13);
- v[12] = k_madd_epi32(u[12], k32_p03_p29);
- v[13] = k_madd_epi32(u[13], k32_p03_p29);
- v[14] = k_madd_epi32(u[14], k32_p03_p29);
- v[15] = k_madd_epi32(u[15], k32_p03_p29);
- v[16] = k_madd_epi32(u[12], k32_m29_p03);
- v[17] = k_madd_epi32(u[13], k32_m29_p03);
- v[18] = k_madd_epi32(u[14], k32_m29_p03);
- v[19] = k_madd_epi32(u[15], k32_m29_p03);
- v[20] = k_madd_epi32(u[8], k32_m13_p19);
- v[21] = k_madd_epi32(u[9], k32_m13_p19);
- v[22] = k_madd_epi32(u[10], k32_m13_p19);
- v[23] = k_madd_epi32(u[11], k32_m13_p19);
- v[24] = k_madd_epi32(u[4], k32_m21_p11);
- v[25] = k_madd_epi32(u[5], k32_m21_p11);
- v[26] = k_madd_epi32(u[6], k32_m21_p11);
- v[27] = k_madd_epi32(u[7], k32_m21_p11);
- v[28] = k_madd_epi32(u[0], k32_m05_p27);
- v[29] = k_madd_epi32(u[1], k32_m05_p27);
- v[30] = k_madd_epi32(u[2], k32_m05_p27);
- v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[5] = _mm_packs_epi32(u[0], u[1]);
- out[21] = _mm_packs_epi32(u[2], u[3]);
- out[13] = _mm_packs_epi32(u[4], u[5]);
- out[29] = _mm_packs_epi32(u[6], u[7]);
- out[3] = _mm_packs_epi32(u[8], u[9]);
- out[19] = _mm_packs_epi32(u[10], u[11]);
- out[11] = _mm_packs_epi32(u[12], u[13]);
- out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
- &out[3], &out[19], &out[11], &out[27]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
-#endif // FDCT32x32_HIGH_PRECISION
- // Transpose the results, do it as four 8x8 transposes.
- {
- int transpose_block;
- int16_t *output0 = &intermediate[column_start * 32];
- tran_low_t *output1 = &output_org[column_start * 32];
- for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
- __m128i *this_out = &out[8 * transpose_block];
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- if (0 == pass) {
- // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
- // TODO(cd): see quality impact of only doing
- // output[j] = (output[j] + 1) >> 2;
- // which would remove the code between here ...
- __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
- __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
- __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
- __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
- __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
- __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
- __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
- __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
- tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
- tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
- tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
- tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
- tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
- tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
- tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
- tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
- // ... and here.
- // PS: also change code in av1/encoder/dct.c
- tr2_0 = _mm_add_epi16(tr2_0, kOne);
- tr2_1 = _mm_add_epi16(tr2_1, kOne);
- tr2_2 = _mm_add_epi16(tr2_2, kOne);
- tr2_3 = _mm_add_epi16(tr2_3, kOne);
- tr2_4 = _mm_add_epi16(tr2_4, kOne);
- tr2_5 = _mm_add_epi16(tr2_5, kOne);
- tr2_6 = _mm_add_epi16(tr2_6, kOne);
- tr2_7 = _mm_add_epi16(tr2_7, kOne);
- tr2_0 = _mm_srai_epi16(tr2_0, 2);
- tr2_1 = _mm_srai_epi16(tr2_1, 2);
- tr2_2 = _mm_srai_epi16(tr2_2, 2);
- tr2_3 = _mm_srai_epi16(tr2_3, 2);
- tr2_4 = _mm_srai_epi16(tr2_4, 2);
- tr2_5 = _mm_srai_epi16(tr2_5, 2);
- tr2_6 = _mm_srai_epi16(tr2_6, 2);
- tr2_7 = _mm_srai_epi16(tr2_7, 2);
- }
- // Note: even though all these stores are aligned, using the aligned
- // intrinsic make the code slightly slower.
- if (pass == 0) {
- _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
- _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
- _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
- _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
- _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
- _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
- _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
- _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
- // Process next 8x8
- output0 += 8;
- } else {
- storeu_output(&tr2_0, (output1 + 0 * 32));
- storeu_output(&tr2_1, (output1 + 1 * 32));
- storeu_output(&tr2_2, (output1 + 2 * 32));
- storeu_output(&tr2_3, (output1 + 3 * 32));
- storeu_output(&tr2_4, (output1 + 4 * 32));
- storeu_output(&tr2_5, (output1 + 5 * 32));
- storeu_output(&tr2_6, (output1 + 6 * 32));
- storeu_output(&tr2_7, (output1 + 7 * 32));
- // Process next 8x8
- output1 += 8;
- }
- }
- }
- }
- }
-} // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c
index f0bcef9..c09a019 100644
--- a/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -1,354 +1,5 @@
#include "av1/common/x86/av1_txfm1d_sse4.h"
-void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 4;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[4];
- __m128i buf1[4];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
- buf0[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[2];
- buf1[2] = buf0[1];
- buf1[3] = buf0[3];
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- }
-}
-
-void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 8;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[8];
- __m128i buf1[8];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
- buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
- buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
- buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
- buf0[4] = buf1[4];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
- buf0[6], bit);
- buf0[7] = buf1[7];
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
- buf1[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
- buf1[3], bit);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
- buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- bit);
- btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
- buf0[6], bit);
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[4];
- buf1[2] = buf0[2];
- buf1[3] = buf0[6];
- buf1[4] = buf0[1];
- buf1[5] = buf0[5];
- buf1[6] = buf0[3];
- buf1[7] = buf0[7];
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- }
-}
-
-void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 16;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[16];
- __m128i buf1[16];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
- buf0[8] = input[8 * col_num + col];
- buf0[9] = input[9 * col_num + col];
- buf0[10] = input[10 * col_num + col];
- buf0[11] = input[11 * col_num + col];
- buf0[12] = input[12 * col_num + col];
- buf0[13] = input[13 * col_num + col];
- buf0[14] = input[14 * col_num + col];
- buf0[15] = input[15 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
- buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
- buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
- buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
- buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
- buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
- buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
- buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
- buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
- buf0[14] = buf1[14];
- buf0[15] = buf1[15];
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
- buf1[4] = buf0[4];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
- buf1[6], bit);
- buf1[7] = buf0[7];
- buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
- buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
- buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
- buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
- buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
- buf0[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
- buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
- buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
- buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
- buf0[8] = buf1[8];
- btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- buf0[11] = buf1[11];
- buf0[12] = buf1[12];
- buf0[15] = buf1[15];
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[1];
- buf1[2] = buf0[2];
- buf1[3] = buf0[3];
- btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
- bit);
- btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
- buf1[6], bit);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
- buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
- buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
- buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
- buf0[15], bit);
- btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[8];
- buf1[2] = buf0[4];
- buf1[3] = buf0[12];
- buf1[4] = buf0[2];
- buf1[5] = buf0[10];
- buf1[6] = buf0[6];
- buf1[7] = buf0[14];
- buf1[8] = buf0[1];
- buf1[9] = buf0[9];
- buf1[10] = buf0[5];
- buf1[11] = buf0[13];
- buf1[12] = buf0[3];
- buf1[13] = buf0[11];
- buf1[14] = buf0[7];
- buf1[15] = buf0[15];
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- output[8 * col_num + col] = buf1[8];
- output[9 * col_num + col] = buf1[9];
- output[10 * col_num + col] = buf1[10];
- output[11 * col_num + col] = buf1[11];
- output[12 * col_num + col] = buf1[12];
- output[13 * col_num + col] = buf1[13];
- output[14 * col_num + col] = buf1[14];
- output[15 * col_num + col] = buf1[15];
- }
-}
-
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
const int8_t *cos_bit, const int8_t *stage_range) {
const int txfm_size = 32;
@@ -835,370 +486,6 @@
}
}
-void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 8;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[8];
- __m128i buf1[8];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[7];
- buf1[1] = buf0[0];
- buf1[2] = buf0[5];
- buf1[3] = buf0[2];
- buf1[4] = buf0[3];
- buf1[5] = buf0[4];
- buf1[6] = buf0[1];
- buf1[7] = buf0[6];
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
- buf1[2] = buf0[6];
- buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
- buf1[4] = buf0[3];
- buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
- buf1[6] = buf0[5];
- buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- }
-}
-
-void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 16;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[16];
- __m128i buf1[16];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
- buf0[8] = input[8 * col_num + col];
- buf0[9] = input[9 * col_num + col];
- buf0[10] = input[10 * col_num + col];
- buf0[11] = input[11 * col_num + col];
- buf0[12] = input[12 * col_num + col];
- buf0[13] = input[13 * col_num + col];
- buf0[14] = input[14 * col_num + col];
- buf0[15] = input[15 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[15];
- buf1[1] = buf0[0];
- buf1[2] = buf0[13];
- buf1[3] = buf0[2];
- buf1[4] = buf0[11];
- buf1[5] = buf0[4];
- buf1[6] = buf0[9];
- buf1[7] = buf0[6];
- buf1[8] = buf0[7];
- buf1[9] = buf0[8];
- buf1[10] = buf0[5];
- buf1[11] = buf0[10];
- buf1[12] = buf0[3];
- buf1[13] = buf0[12];
- buf1[14] = buf0[1];
- buf1[15] = buf0[14];
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
- buf0[9], bit);
- btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
- buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
- buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- buf0[10] = buf1[10];
- buf0[11] = buf1[11];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
- buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
-
- // stage 8
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- buf0[12] = buf1[12];
- buf0[13] = buf1[13];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 9
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
- buf1[2] = buf0[12];
- buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
- buf1[4] = buf0[6];
- buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
- buf1[6] = buf0[10];
- buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
- buf1[8] = buf0[3];
- buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
- buf1[10] = buf0[15];
- buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
- buf1[12] = buf0[5];
- buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
- buf1[14] = buf0[9];
- buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- output[8 * col_num + col] = buf1[8];
- output[9 * col_num + col] = buf1[9];
- output[10 * col_num + col] = buf1[10];
- output[11 * col_num + col] = buf1[11];
- output[12 * col_num + col] = buf1[12];
- output[13 * col_num + col] = buf1[13];
- output[14 * col_num + col] = buf1[14];
- output[15 * col_num + col] = buf1[15];
- }
-}
-
void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
const int8_t *cos_bit, const int8_t *stage_range) {
const int txfm_size = 32;
diff --git a/av1/common/x86/av1_fwd_txfm2d_sse4.c b/av1/common/x86/av1_fwd_txfm2d_sse4.c
index 07c283e..3d60b36 100644
--- a/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -28,13 +28,7 @@
static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
- case TXFM_TYPE_DCT4: return av1_fdct4_new_sse4_1; break;
- case TXFM_TYPE_DCT8: return av1_fdct8_new_sse4_1; break;
- case TXFM_TYPE_DCT16: return av1_fdct16_new_sse4_1; break;
case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break;
- case TXFM_TYPE_ADST4: return av1_fadst4_new_sse4_1; break;
- case TXFM_TYPE_ADST8: return av1_fadst8_new_sse4_1; break;
- case TXFM_TYPE_ADST16: return av1_fadst16_new_sse4_1; break;
case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break;
default: assert(0);
}
diff --git a/av1/common/x86/av1_fwd_txfm_impl_sse2.h b/av1/common/x86/av1_fwd_txfm_impl_sse2.h
deleted file mode 100644
index 0e341ac..0000000
--- a/av1/common/x86/av1_fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,1014 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
- // This 2D transform implements 4 vertical 1D transforms followed
- // by 4 horizontal 1D transforms. The multiplies and adds are as given
- // by Chen, Smith and Fralick ('77). The commands for moving the data
- // around have been minimized by hand.
- // For the purposes of the comments, the 16 inputs are referred to at i0
- // through iF (in raster order), intermediate variables are a0, b0, c0
- // through f, and correspond to the in-place computations mapped to input
- // locations. The outputs, o0 through oF are labeled according to the
- // output locations.
-
- // Constants
- // These are the coefficients used for the multiplies.
- // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
- // where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A =
- octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
- cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B =
- octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
- cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C =
- octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
- cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D =
- octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
- cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E =
- octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F =
- octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G =
- octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H =
- octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
- -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // This second rounding constant saves doing some extra adds at the end
- const __m128i k__DCT_CONST_ROUNDING2 =
- _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
- const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
- __m128i cmp0, cmp1;
- int test, overflow;
-#endif
-
- // Load inputs.
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(
- in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(
- in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-// in0 = [i0 i1 i2 i3 iC iD iE iF]
-// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-#if DCT_HIGH_BIT_DEPTH
- // Check inputs small enough to use optimised code
- cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
- _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
- cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
- _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
- test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
- if (test) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // multiply by 16 to give some extra precision
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- // add 1 to the upper left pixel if it is non-zero, which helps reduce
- // the round-trip error
- {
- // The mask will only contain whether the first value is zero, all
- // other comparison will fail as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
- // There are 4 total stages, alternating between an add/subtract stage
- // followed by an multiply-and-add stage.
- {
- // Stage 1: Add/subtract
-
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
- const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
- // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
- // r1 = [iC i8 iD i9 iE iA iF iB]
- const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
- const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
- // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
- // r3 = [iC i8 iD i9 iF iB iE iA]
-
- const __m128i t0 = _mm_add_epi16(r2, r3);
- const __m128i t1 = _mm_sub_epi16(r2, r3);
- // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
- // t1 = [aC a8 aD a9 aF aB aE aA]
-
- // Stage 2: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
- // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
- // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
- // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
- // Then add and right-shift to get back to 16-bit range
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // w0 = [b0 b1 b7 b6]
- // w1 = [b8 b9 bF bE]
- // w2 = [b4 b5 b3 b2]
- // w3 = [bC bD bB bA]
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&x0, &x1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
- // x1 = [b4 b5 b3 b2 bC bD bB bA]
- in0 = _mm_shuffle_epi32(x0, 0xD8);
- in1 = _mm_shuffle_epi32(x1, 0x8D);
- // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
- // in1 = [b3 b2 bB bA b4 b5 bC bD]
- }
- {
- // vertical DCTs finished. Now we do the horizontal DCTs.
- // Stage 3: Add/subtract
-
- // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
- // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
- const __m128i t0 = ADD_EPI16(in0, in1);
- const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&t0, &t1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // Stage 4: multiply by constants (which gets us into 32 bits).
- {
- // The constants needed here are:
- // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
- // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
- // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
- // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
- const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
- const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
- // Then add and right-shift to get back to 16-bit range
- // but this combines the final right-shift as well to save operations
- // This unusual rounding operations is to maintain bit-accurate
- // compatibility with the c version of this function which has two
- // rounding steps in a row.
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
- // w0 = [o0 o4 o8 oC]
- // w1 = [o2 o6 oA oE]
- // w2 = [o1 o5 o9 oD]
- // w3 = [o3 o7 oB oF]
- // remember the o's are numbered according to the correct output location
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&x0, &x1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // x0 = [o0 o4 o8 oC o2 o6 oA oE]
- // x1 = [o1 o5 o9 oD o3 o7 oB oF]
- const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
- const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
- // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
- // y1 = [o2 o3 o6 o7 oA oB oE oF]
- in0 = _mm_unpacklo_epi32(y0, y1);
- // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
- in1 = _mm_unpackhi_epi32(y0, y1);
- // in1 = [o8 o9 oA oB oC oD oE oF]
- }
- }
- }
- // Post-condition (v + 1) >> 2 is now incorporated into previous
- // add and right-shift commands. Only 2 store instructions needed
- // because we are using the fact that 1/3 are stored just after 0/2.
- storeu_output(&in0, output + 0 * 4);
- storeu_output(&in1, output + 2 * 4);
-}
-
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = ADD_EPI16(in0, in7);
- const __m128i q1 = ADD_EPI16(in1, in6);
- const __m128i q2 = ADD_EPI16(in2, in5);
- const __m128i q3 = ADD_EPI16(in3, in4);
- const __m128i q4 = SUB_EPI16(in3, in4);
- const __m128i q5 = SUB_EPI16(in2, in5);
- const __m128i q6 = SUB_EPI16(in1, in6);
- const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
- if (pass == 1) {
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us into 32bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us into 32bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- // store results
- store_output(&in0, (output + 0 * 8));
- store_output(&in1, (output + 1 * 8));
- store_output(&in2, (output + 2 * 8));
- store_output(&in3, (output + 3 * 8));
- store_output(&in4, (output + 4 * 8));
- store_output(&in5, (output + 5 * 8));
- store_output(&in6, (output + 6 * 8));
- store_output(&in7, (output + 7 * 8));
- }
-}
-
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[256]);
- const int16_t *in = input;
- int16_t *out0 = intermediate;
- tran_low_t *out1 = output;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = ADD_EPI16(in00, in15);
- input1 = ADD_EPI16(in01, in14);
- input2 = ADD_EPI16(in02, in13);
- input3 = ADD_EPI16(in03, in12);
- input4 = ADD_EPI16(in04, in11);
- input5 = ADD_EPI16(in05, in10);
- input6 = ADD_EPI16(in06, in09);
- input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
- &input4, &input5, &input6, &input7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = SUB_EPI16(in07, in08);
- step1_1 = SUB_EPI16(in06, in09);
- step1_2 = SUB_EPI16(in05, in10);
- step1_3 = SUB_EPI16(in04, in11);
- step1_4 = SUB_EPI16(in03, in12);
- step1_5 = SUB_EPI16(in02, in13);
- step1_6 = SUB_EPI16(in01, in14);
- step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = ADD_EPI16(input0, input7);
- const __m128i q1 = ADD_EPI16(input1, input6);
- const __m128i q2 = ADD_EPI16(input2, input5);
- const __m128i q3 = ADD_EPI16(input3, input4);
- const __m128i q4 = SUB_EPI16(input3, input4);
- const __m128i q5 = SUB_EPI16(input2, input5);
- const __m128i q6 = SUB_EPI16(input1, input6);
- const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i r0 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- const __m128i r1 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 3
- {
- step3_0 = ADD_EPI16(step1_0, step2_3);
- step3_1 = ADD_EPI16(step1_1, step2_2);
- step3_2 = SUB_EPI16(step1_1, step2_2);
- step3_3 = SUB_EPI16(step1_0, step2_3);
- step3_4 = SUB_EPI16(step1_7, step2_4);
- step3_5 = SUB_EPI16(step1_6, step2_5);
- step3_6 = ADD_EPI16(step1_6, step2_5);
- step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
- &step3_4, &step3_5, &step3_6, &step3_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 5
- {
- step1_0 = ADD_EPI16(step3_0, step2_1);
- step1_1 = SUB_EPI16(step3_0, step2_1);
- step1_2 = ADD_EPI16(step3_3, step2_2);
- step1_3 = SUB_EPI16(step3_3, step2_2);
- step1_4 = SUB_EPI16(step3_4, step2_5);
- step1_5 = ADD_EPI16(step3_4, step2_5);
- step1_6 = SUB_EPI16(step3_7, step2_6);
- step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
- &res06, &res07, pass, out0, out1);
- transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
- &res14, &res15, pass, out0 + 8, out1 + 8);
- if (pass == 0) {
- out0 += 8 * 16;
- } else {
- out1 += 8 * 16;
- }
- }
- // Setup in/out for next pass.
- in = intermediate;
- }
-}
-
-#undef ADD_EPI16
-#undef SUB_EPI16
diff --git a/av1/common/x86/av1_fwd_txfm_sse2.c b/av1/common/x86/av1_fwd_txfm_sse2.c
deleted file mode 100644
index 081fe08..0000000
--- a/av1/common/x86/av1_fwd_txfm_sse2.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./aom_config.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0, in1;
- __m128i tmp;
- const __m128i zero = _mm_setzero_si128();
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(
- in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(
- in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
- tmp = _mm_add_epi16(in0, in1);
- in0 = _mm_unpacklo_epi16(zero, tmp);
- in1 = _mm_unpackhi_epi16(zero, tmp);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(tmp, zero);
- in1 = _mm_unpackhi_epi32(tmp, zero);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(tmp, 8);
-
- in1 = _mm_add_epi32(tmp, in0);
- in0 = _mm_slli_epi32(in1, 1);
- store_output(&in0, output);
-}
-
-void av1_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i u0, u1, sum;
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(u0, u1);
-
- in0 = _mm_add_epi16(in0, in1);
- in2 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, in0);
-
- u0 = _mm_setzero_si128();
- sum = _mm_add_epi16(sum, in2);
-
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- store_output(&in1, output);
-}
-
-void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 2; ++i) {
- input += 8 * i;
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 1);
- store_output(&in1, output);
-}
-
-void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 8; ++i) {
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 3);
- store_output(&in1, output);
-}
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D av1_fdct4x4_sse2
-#define FDCT8x8_2D av1_fdct8x8_sse2
-#define FDCT16x16_2D av1_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_AOM_HIGHBITDEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D av1_highbd_fdct4x4_sse2
-#define FDCT8x8_2D av1_highbd_fdct8x8_sse2
-#define FDCT16x16_2D av1_highbd_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h" // NOLINT
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
deleted file mode 100644
index 365c124..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ /dev/null
@@ -1,4028 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/av1_inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x) \
- { \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)(dest) = _mm_cvtsi128_si32(d0); \
- }
-
-void av1_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16(
- (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i input0, input1, input2, input3;
-
- // Rows
- input0 = _mm_load_si128((const __m128i *)input);
- input2 = _mm_load_si128((const __m128i *)(input + 8));
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input0 = _mm_shufflehi_epi16(input0, 0xd8);
- input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
- input1 = _mm_unpackhi_epi32(input0, input0);
- input0 = _mm_unpacklo_epi32(input0, input0);
- input3 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpacklo_epi32(input2, input2);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input1);
- input1 = _mm_packs_epi32(input2, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
- // Switch column2, column 3, and then, we got:
- // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Columns
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_unpacklo_epi32(input2, input2);
- input1 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpackhi_epi32(input3, input3);
- input3 = _mm_unpacklo_epi32(input3, input3);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input2);
- input1 = _mm_packs_epi32(input1, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
- // Switch column2, column 3, and then, we got:
- // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Final round and shift
- input2 = _mm_add_epi16(input2, eight);
- input3 = _mm_add_epi16(input3, eight);
-
- input2 = _mm_srai_epi16(input2, 4);
- input3 = _mm_srai_epi16(input3, 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- }
-}
-
-void av1_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 4);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-void av1_idct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
-
- transpose_4x4(in);
- // stage 1
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[3], v[2]);
-
- // stage 2
- in[0] = _mm_add_epi16(u[0], u[1]);
- in[1] = _mm_sub_epi16(u[0], u[1]);
- in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-void av1_iadst4_sse2(__m128i *in) {
- const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
- const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8], in7;
-
- transpose_4x4(in);
- in7 = _mm_srli_si128(in[1], 8);
- in7 = _mm_add_epi16(in7, in[0]);
- in7 = _mm_sub_epi16(in7, in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
- v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_add_epi32(v[3], v[4]);
- u[2] = v[2];
- u[3] = _mm_add_epi32(u[0], u[1]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_add_epi32(u[3], v[5]);
- u[6] = _mm_sub_epi32(u[5], u[4]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
- res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
- out4, out5, out6, out7) \
- { \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
- stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
- }
-
-void av1_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-
- // 2-D
- for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from av1_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- // 4-stage 1D av1_idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
- in6, in7);
- }
-
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void av1_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 5);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void av1_idct8_sse2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // 8x8 Transpose is copied from av1_fdct8x8_sse2()
- TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
- in1, in2, in3, in4, in5, in6, in7);
-
- // 4-stage 1D av1_idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
- in[4], in[5], in[6], in[7]);
-}
-
-void av1_iadst8_sse2(__m128i *in) {
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // transpose
- array_transpose_8x8(in, in);
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integer
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit intergers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void av1_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // Rows. Load 4-row input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-
- // 8x4 Transpose
- TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- stp1_5 = _mm_packs_epi32(tmp4, tmp6);
- }
-
- // Stage2
- {
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_04, stg2_0);
- tmp2 = _mm_madd_epi16(lo_04, stg2_1);
- tmp4 = _mm_madd_epi16(lo_26, stg2_2);
- tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp0, tmp2);
- stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
- tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
- tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp0;
- stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage3
- {
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
- tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
- tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
- stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
- tmp0 = _mm_madd_epi16(lo_56, stg3_0);
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
- tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
- tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
- tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
- TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
- IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
- in5, in6, in7);
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
- \
- MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
- stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
- stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
-
-#define IDCT16_10 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
- stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
- stp1_12_0) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
- \
- stp1_9 = stp1_8_0; \
- stp1_10 = stp1_11; \
- \
- stp1_13 = stp1_12_0; \
- stp1_14 = stp1_15; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
- stp2_5 = stp2_4; \
- stp2_6 = stp2_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_2 = stp1_1; \
- stp1_3 = stp1_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
-
-void av1_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[16], l[16], r[16], *curr1;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- curr1 = l;
- for (i = 0; i < 2; i++) {
- // 1-D av1_idct
-
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
- }
- for (i = 0; i < 2; i++) {
- int j;
- // 1-D av1_idct
- array_transpose_8x8(l + i * 8, in);
- array_transpose_8x8(r + i * 8, in + 8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 2; ++i) {
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
- RECON_AND_STORE(dest + 8 * stride, dc_value);
- RECON_AND_STORE(dest + 9 * stride, dc_value);
- RECON_AND_STORE(dest + 10 * stride, dc_value);
- RECON_AND_STORE(dest + 11 * stride, dc_value);
- RECON_AND_STORE(dest + 12 * stride, dc_value);
- RECON_AND_STORE(dest + 13 * stride, dc_value);
- RECON_AND_STORE(dest + 14 * stride, dc_value);
- RECON_AND_STORE(dest + 15 * stride, dc_value);
- dest += 8;
- }
-}
-
-static void av1_iadst16_8col(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void av1_idct16_8col(__m128i *in) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i v[16], u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(t[0], t[1]);
- u[1] = _mm_unpackhi_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_unpacklo_epi16(s[5], s[6]);
- u[1] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- t[5] = _mm_packs_epi32(u[0], u[1]);
- t[6] = _mm_packs_epi32(u[2], u[3]);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void av1_idct16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- av1_idct16_8col(in0);
- av1_idct16_8col(in1);
-}
-
-void av1_iadst16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- av1_iadst16_8col(in0);
- av1_iadst16_8col(in1);
-}
-
-void av1_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
- stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
- stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
- // First 1-D inverse DCT
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
- tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
- stp2_11 = _mm_packs_epi32(tmp5, tmp7);
- }
-
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp1_0 = _mm_packs_epi32(tmp0, tmp0);
- stp1_1 = _mm_packs_epi32(tmp2, tmp2);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
-
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
-
- // Stage6
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
- tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
- stp2_10 = _mm_packs_epi32(tmp0, zero);
- stp2_13 = _mm_packs_epi32(tmp2, zero);
- stp2_11 = _mm_packs_epi32(tmp4, zero);
- stp2_12 = _mm_packs_epi32(tmp6, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- int j;
- array_transpose_4X8(l + 8 * i, in);
-
- IDCT16_10
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-#define LOAD_DQCOEFF(reg, input) \
- { \
- reg = _mm_load_si128((const __m128i *)input); \
- input += 8; \
- }
-
-#define IDCT32_34 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
- \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
- \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
- stp1_31); \
- MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
- stp1_28); \
- MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
- stp1_27); \
- MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
- stp1_24); \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
- \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
- stp2_15); \
- MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
- stp2_12); \
- \
- stp2_16 = stp1_16; \
- stp2_19 = stp1_19; \
- \
- stp2_20 = stp1_20; \
- stp2_23 = stp1_23; \
- \
- stp2_24 = stp1_24; \
- stp2_27 = stp1_27; \
- \
- stp2_28 = stp1_28; \
- stp2_31 = stp1_31; \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
- \
- MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
- stp1_7); \
- \
- stp1_8 = stp2_8; \
- stp1_11 = stp2_11; \
- stp1_12 = stp2_12; \
- stp1_15 = stp2_15; \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
- stp2_1); \
- \
- stp2_4 = stp1_4; \
- stp2_5 = stp1_4; \
- stp2_6 = stp1_7; \
- stp2_7 = stp1_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = stp2_0; \
- stp1_1 = stp2_1; \
- stp1_2 = stp2_1; \
- stp1_3 = stp2_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
-
-#define IDCT32 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
- stp1_30) \
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
- stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
- \
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
- stp2_14) \
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- \
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
- stp1_6) \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
-
-// Only upper-left 8x8 has non-zero coeff
-void av1_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
- // av1_idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[32];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data. Only need to load the top left 8x8 block.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 32));
- in[2] = _mm_load_si128((const __m128i *)(input + 64));
- in[3] = _mm_load_si128((const __m128i *)(input + 96));
- in[4] = _mm_load_si128((const __m128i *)(input + 128));
- in[5] = _mm_load_si128((const __m128i *)(input + 160));
- in[6] = _mm_load_si128((const __m128i *)(input + 192));
- in[7] = _mm_load_si128((const __m128i *)(input + 224));
-
- for (i = 8; i < 32; ++i) {
- in[i] = _mm_setzero_si128();
- }
-
- array_transpose_8x8(in, in);
- // TODO(hkuang): Following transposes are unnecessary. But remove them will
- // lead to performance drop on some devices.
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32_34
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[0] = _mm_add_epi16(stp1_0, stp1_31);
- col[1] = _mm_add_epi16(stp1_1, stp1_30);
- col[2] = _mm_add_epi16(stp1_2, stp1_29);
- col[3] = _mm_add_epi16(stp1_3, stp1_28);
- col[4] = _mm_add_epi16(stp1_4, stp1_27);
- col[5] = _mm_add_epi16(stp1_5, stp1_26);
- col[6] = _mm_add_epi16(stp1_6, stp1_25);
- col[7] = _mm_add_epi16(stp1_7, stp1_24);
- col[8] = _mm_add_epi16(stp1_8, stp1_23);
- col[9] = _mm_add_epi16(stp1_9, stp1_22);
- col[10] = _mm_add_epi16(stp1_10, stp1_21);
- col[11] = _mm_add_epi16(stp1_11, stp1_20);
- col[12] = _mm_add_epi16(stp1_12, stp1_19);
- col[13] = _mm_add_epi16(stp1_13, stp1_18);
- col[14] = _mm_add_epi16(stp1_14, stp1_17);
- col[15] = _mm_add_epi16(stp1_15, stp1_16);
- col[16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[31] = _mm_sub_epi16(stp1_0, stp1_31);
- for (i = 0; i < 4; i++) {
- int j;
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + i * 8, in);
- IDCT32_34
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- // av1_idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[128], zero_idx[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
-
- for (i = 0; i < 4; i++) {
- i32 = (i << 5);
- // First 1-D av1_idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- }
- for (i = 0; i < 4; i++) {
- // Second 1-D av1_idct
- j = i << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + j, in);
- array_transpose_8x8(col + j + 32, in + 8);
- array_transpose_8x8(col + j + 64, in + 16);
- array_transpose_8x8(col + j + 96, in + 24);
-
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 4; ++i) {
- int j;
- for (j = 0; j < 32; ++j) {
- RECON_AND_STORE(dest + j * stride, dc_value);
- }
- dest += 8;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
-}
-
-void av1_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- av1_idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
- }
-
- if (optimised_cols) {
- av1_idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_8x8(inptr, inptr);
- for (i = 0; i < 8; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 8; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- av1_idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // only first 4 row has non-zero coefs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_4X8(inptr, inptr);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- av1_idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_16x16(inptr, inptr + 16);
- for (i = 0; i < 16; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 16; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // Since all non-zero dct coefficients are in upper-left 4x4 area,
- // we only need to consider first 4 rows here.
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform (N.B. This transposes inptr)
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 16; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_8x8(inptr, inptr);
- array_transpose_8x8(inptr + 8, inptr + 16);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.h b/av1/common/x86/av1_inv_txfm_sse2.h
deleted file mode 100644
index a8bb6c1..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- \
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
-
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
-}
-
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- _mm_storel_epi64((__m128i *)(dest), d0); \
- }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
- RECON_AND_STORE(dest + 8 * stride, in[8]);
- RECON_AND_STORE(dest + 9 * stride, in[9]);
- RECON_AND_STORE(dest + 10 * stride, in[10]);
- RECON_AND_STORE(dest + 11 * stride, in[11]);
- RECON_AND_STORE(dest + 12 * stride, in[12]);
- RECON_AND_STORE(dest + 13 * stride, in[13]);
- RECON_AND_STORE(dest + 14 * stride, in[14]);
- RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 0000000..754152c
--- /dev/null
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+#else
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+ int i = 0;
+ while (i < 16) {
+ load_coeff(coeff + (i << 4), &in[i]);
+ i += 1;
+ }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x = _mm_loadu_si128((__m128i const *)output);
+ __m128i p0 = _mm_unpacklo_epi8(x, zero);
+ __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+ p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+ p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+ x = _mm_packus_epi16(p0, p1);
+ _mm_storeu_si128((__m128i *)output, x);
+}
+
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
+ const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_add_epi16(in[i], rounding);
+ in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+ recon_and_store(&in[i], output + i * stride);
+ i += 1;
+ }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b0, __m256i *b1) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ *b0 = butter_fly(x0, x1, *c0);
+ *b1 = butter_fly(x0, x1, *c1);
+}
+
+static void idct16_avx2(__m256i *in) {
+ const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+ const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+ const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+ const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ // stage 1, (0-7)
+ u0 = in[0];
+ u1 = in[8];
+ u2 = in[4];
+ u3 = in[12];
+ u4 = in[2];
+ u5 = in[10];
+ u6 = in[6];
+ u7 = in[14];
+
+ // stage 2, (0-7)
+ // stage 3, (0-7)
+ t0 = u0;
+ t1 = u1;
+ t2 = u2;
+ t3 = u3;
+ unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
+ unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
+
+ // stage 4, (0-7)
+ unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
+ unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
+ u4 = _mm256_add_epi16(t4, t5);
+ u5 = _mm256_sub_epi16(t4, t5);
+ u6 = _mm256_sub_epi16(t7, t6);
+ u7 = _mm256_add_epi16(t7, t6);
+
+ // stage 5, (0-7)
+ t0 = _mm256_add_epi16(u0, u3);
+ t1 = _mm256_add_epi16(u1, u2);
+ t2 = _mm256_sub_epi16(u1, u2);
+ t3 = _mm256_sub_epi16(u0, u3);
+ t4 = u4;
+ t7 = u7;
+ unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
+
+ // stage 6, (0-7)
+ u0 = _mm256_add_epi16(t0, t7);
+ u1 = _mm256_add_epi16(t1, t6);
+ u2 = _mm256_add_epi16(t2, t5);
+ u3 = _mm256_add_epi16(t3, t4);
+ u4 = _mm256_sub_epi16(t3, t4);
+ u5 = _mm256_sub_epi16(t2, t5);
+ u6 = _mm256_sub_epi16(t1, t6);
+ u7 = _mm256_sub_epi16(t0, t7);
+
+ // stage 1, (8-15)
+ v0 = in[1];
+ v1 = in[9];
+ v2 = in[5];
+ v3 = in[13];
+ v4 = in[3];
+ v5 = in[11];
+ v6 = in[7];
+ v7 = in[15];
+
+ // stage 2, (8-15)
+ unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
+ unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
+ unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
+
+ // stage 3, (8-15)
+ v0 = _mm256_add_epi16(t0, t1);
+ v1 = _mm256_sub_epi16(t0, t1);
+ v2 = _mm256_sub_epi16(t3, t2);
+ v3 = _mm256_add_epi16(t2, t3);
+ v4 = _mm256_add_epi16(t4, t5);
+ v5 = _mm256_sub_epi16(t4, t5);
+ v6 = _mm256_sub_epi16(t7, t6);
+ v7 = _mm256_add_epi16(t6, t7);
+
+ // stage 4, (8-15)
+ t0 = v0;
+ t7 = v7;
+ t3 = v3;
+ t4 = v4;
+ unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
+
+ // stage 5, (8-15)
+ v0 = _mm256_add_epi16(t0, t3);
+ v1 = _mm256_add_epi16(t1, t2);
+ v2 = _mm256_sub_epi16(t1, t2);
+ v3 = _mm256_sub_epi16(t0, t3);
+ v4 = _mm256_sub_epi16(t7, t4);
+ v5 = _mm256_sub_epi16(t6, t5);
+ v6 = _mm256_add_epi16(t6, t5);
+ v7 = _mm256_add_epi16(t7, t4);
+
+ // stage 6, (8-15)
+ t0 = v0;
+ t1 = v1;
+ t6 = v6;
+ t7 = v7;
+ unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
+ unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
+
+ // stage 7
+ in[0] = _mm256_add_epi16(u0, t7);
+ in[1] = _mm256_add_epi16(u1, t6);
+ in[2] = _mm256_add_epi16(u2, t5);
+ in[3] = _mm256_add_epi16(u3, t4);
+ in[4] = _mm256_add_epi16(u4, t3);
+ in[5] = _mm256_add_epi16(u5, t2);
+ in[6] = _mm256_add_epi16(u6, t1);
+ in[7] = _mm256_add_epi16(u7, t0);
+ in[8] = _mm256_sub_epi16(u7, t0);
+ in[9] = _mm256_sub_epi16(u6, t1);
+ in[10] = _mm256_sub_epi16(u5, t2);
+ in[11] = _mm256_sub_epi16(u4, t3);
+ in[12] = _mm256_sub_epi16(u3, t4);
+ in[13] = _mm256_sub_epi16(u2, t5);
+ in[14] = _mm256_sub_epi16(u1, t6);
+ in[15] = _mm256_sub_epi16(u0, t7);
+}
+
+static void idct16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ idct16_avx2(in);
+}
+
+static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ b[0] = _mm256_madd_epi16(x0, *c0);
+ b[1] = _mm256_madd_epi16(x1, *c0);
+ b[2] = _mm256_madd_epi16(x0, *c1);
+ b[3] = _mm256_madd_epi16(x1, *c1);
+}
+
+static INLINE void group_rounding(__m256i *a, int num) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ int i;
+ for (i = 0; i < num; ++i) {
+ a[i] = _mm256_add_epi32(a[i], dct_rounding);
+ a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
+ }
+}
+
+static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_add_epi32(a[0], b[0]);
+ x[1] = _mm256_add_epi32(a[1], b[1]);
+ x[2] = _mm256_add_epi32(a[2], b[2]);
+ x[3] = _mm256_add_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_sub_epi32(a[0], b[0]);
+ x[1] = _mm256_sub_epi32(a[1], b[1]);
+ x[2] = _mm256_sub_epi32(a[2], b[2]);
+ x[3] = _mm256_sub_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
+ group_rounding(a, 4);
+ out[0] = _mm256_packs_epi32(a[0], a[1]);
+ out[1] = _mm256_packs_epi32(a[2], a[3]);
+}
+
+static void iadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i x[16], s[16];
+ __m256i u[4], v[4];
+
+ // stage 1
+ butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
+ butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
+ add_rnd(u, v, &x[0]);
+ sub_rnd(u, v, &x[8]);
+
+ butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
+ butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
+ add_rnd(u, v, &x[2]);
+ sub_rnd(u, v, &x[10]);
+
+ butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
+ butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[12]);
+
+ butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
+ butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
+ add_rnd(u, v, &x[6]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 2
+ s[0] = _mm256_add_epi16(x[0], x[4]);
+ s[1] = _mm256_add_epi16(x[1], x[5]);
+ s[2] = _mm256_add_epi16(x[2], x[6]);
+ s[3] = _mm256_add_epi16(x[3], x[7]);
+ s[4] = _mm256_sub_epi16(x[0], x[4]);
+ s[5] = _mm256_sub_epi16(x[1], x[5]);
+ s[6] = _mm256_sub_epi16(x[2], x[6]);
+ s[7] = _mm256_sub_epi16(x[3], x[7]);
+ butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
+ butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
+ add_rnd(u, v, &s[8]);
+ sub_rnd(u, v, &s[12]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
+ add_rnd(u, v, &s[10]);
+ sub_rnd(u, v, &s[14]);
+
+ // stage 3
+ x[0] = _mm256_add_epi16(s[0], s[2]);
+ x[1] = _mm256_add_epi16(s[1], s[3]);
+ x[2] = _mm256_sub_epi16(s[0], s[2]);
+ x[3] = _mm256_sub_epi16(s[1], s[3]);
+
+ x[8] = _mm256_add_epi16(s[8], s[10]);
+ x[9] = _mm256_add_epi16(s[9], s[11]);
+ x[10] = _mm256_sub_epi16(s[8], s[10]);
+ x[11] = _mm256_sub_epi16(s[9], s[11]);
+
+ butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[6]);
+
+ butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[12]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 4
+ butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
+ butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
+ butterfly_rnd(u, &x[2]);
+ butterfly_rnd(v, &x[6]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
+ butterfly_rnd(u, &x[10]);
+ butterfly_rnd(v, &x[14]);
+
+ in[0] = x[0];
+ in[1] = _mm256_sub_epi16(zero, x[8]);
+ in[2] = x[12];
+ in[3] = _mm256_sub_epi16(zero, x[4]);
+ in[4] = x[6];
+ in[5] = x[14];
+ in[6] = x[10];
+ in[7] = x[2];
+ in[8] = x[3];
+ in[9] = x[11];
+ in[10] = x[15];
+ in[11] = x[7];
+ in[12] = x[5];
+ in[13] = _mm256_sub_epi16(zero, x[13]);
+ in[14] = x[9];
+ in[15] = _mm256_sub_epi16(zero, x[1]);
+}
+
+static void iadst16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ iadst16_avx2(in);
+}
+
+#if CONFIG_EXT_TX
+static void flip_row(__m256i *in, int rows) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ mm256_reverse_epi16(&in[i]);
+ }
+}
+
+static void flip_col(uint8_t **dest, int *stride, int rows) {
+ *dest = *dest + (rows - 1) * (*stride);
+ *stride = -*stride;
+}
+
+static void iidtx16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif
+
+void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m256i in[16];
+
+ load_buffer_16x16(input, in);
+ switch (tx_type) {
+ case DCT_DCT:
+ idct16(in);
+ idct16(in);
+ break;
+ case ADST_DCT:
+ idct16(in);
+ iadst16(in);
+ break;
+ case DCT_ADST:
+ iadst16(in);
+ idct16(in);
+ break;
+ case ADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ idct16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case DCT_FLIPADST:
+ iadst16(in);
+ idct16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ flip_col(&dest, &stride, 16);
+ break;
+ case ADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case V_DCT:
+ iidtx16(in);
+ idct16(in);
+ break;
+ case H_DCT:
+ idct16(in);
+ iidtx16(in);
+ break;
+ case V_ADST:
+ iidtx16(in);
+ iadst16(in);
+ break;
+ case H_ADST:
+ iadst16(in);
+ iidtx16(in);
+ break;
+ case V_FLIPADST:
+ iidtx16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case H_FLIPADST:
+ iadst16(in);
+ iidtx16(in);
+ flip_row(in, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x16(in, stride, dest);
+}
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 3298005..a6b6e1e 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -242,69 +242,6 @@
RECON_AND_STORE(dest + 7 * stride, in[7]);
}
-void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
- __m128i in[32];
- __m128i *in0 = &in[0];
- __m128i *in1 = &in[16];
-
- load_buffer_8x16(input, in0);
- input += 8;
- load_buffer_8x16(input, in1);
-
- switch (tx_type) {
- case DCT_DCT:
- aom_idct16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- break;
- case ADST_DCT:
- aom_idct16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- break;
- case DCT_ADST:
- aom_iadst16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- break;
- case ADST_ADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- aom_idct16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
- case DCT_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
- case FLIPADST_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- FLIPLR_16x16(in0, in1);
- break;
- case ADST_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
- case FLIPADST_ADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
-
- write_buffer_8x16(dest, in0, stride);
- dest += 8;
- write_buffer_8x16(dest, in1, stride);
-}
-
#if CONFIG_EXT_TX
static void iidtx16_8col(__m128i *in) {
const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
@@ -501,7 +438,98 @@
iidtx16_8col(in0);
iidtx16_8col(in1);
}
+#endif
+void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[32];
+ __m128i *in0 = &in[0];
+ __m128i *in1 = &in[16];
+
+ load_buffer_8x16(input, in0);
+ input += 8;
+ load_buffer_8x16(input, in1);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case DCT_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case V_DCT:
+ iidtx16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case H_DCT:
+ aom_idct16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_ADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case H_ADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_FLIPADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case H_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ write_buffer_8x16(dest, in0, stride);
+ dest += 8;
+ write_buffer_8x16(dest, in1, stride);
+}
+
+#if CONFIG_EXT_TX
static void iidtx8_sse2(__m128i *in) {
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0eb7737..b1de763 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -134,8 +134,8 @@
#endif
static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
- int i;
#if CONFIG_REF_MV
+ int i;
for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
@@ -148,12 +148,15 @@
av1_diff_update_prob(r, &fc->new2mv_prob, ACCT_STR);
#endif // CONFIG_EXT_INTER
#else
- int j;
#if !CONFIG_EC_ADAPT
+ int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
for (j = 0; j < INTER_MODES - 1; ++j)
av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
}
+#else
+ (void)fc;
+ (void)r;
#endif
#endif
}
@@ -3712,7 +3715,10 @@
#endif
FRAME_CONTEXT *const fc = cm->fc;
aom_reader r;
- int k, i, j;
+ int k, i;
+#if !CONFIG_EC_ADAPT
+ int j;
+#endif
#if !CONFIG_ANS
if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb,
@@ -3845,7 +3851,6 @@
if (cm->reference_mode != SINGLE_REFERENCE)
setup_compound_reference_mode(cm);
-
read_frame_reference_mode_probs(cm, &r);
#if !CONFIG_EC_ADAPT
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index ea1b50c..795b1b0 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -51,7 +51,8 @@
static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
int ctx, const int16_t *scan, const int16_t *nb,
- aom_reader *r, const qm_val_t *iqm[2][TX_SIZES])
+ int16_t *max_scan_line, aom_reader *r,
+ const qm_val_t *iqm[2][TX_SIZES])
#else
static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
@@ -343,7 +344,7 @@
#if CONFIG_AOM_QM
const int eob = decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size,
tx_type, dequant, ctx, sc->scan, sc->neighbors,
- &sc->max_scan_line, r, pd->seg_iqmatrix[seg_id]);
+ max_scan_line, r, pd->seg_iqmatrix[seg_id]);
#else
const int eob =
decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 4275098..fcfae7c 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -314,7 +314,6 @@
aom_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-#if !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
static void prob_diff_update(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
const unsigned int counts[/*n - 1*/], int n,
@@ -330,6 +329,7 @@
av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt);
}
+#if !CONFIG_EC_ADAPT
static int prob_diff_update_savings(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
const unsigned int counts[/*n - 1*/], int n,
@@ -2912,7 +2912,6 @@
if (mode != TX_MODE_SELECT) aom_wb_write_literal(wb, mode, 2);
}
-#if !CONFIG_EC_ADAPT
static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w,
FRAME_COUNTS *counts) {
#if CONFIG_TILE_GROUPS
@@ -2928,7 +2927,6 @@
counts->tx_size[i][j], i + 2, probwt, w);
}
}
-#endif
static void write_interp_filter(InterpFilter filter,
struct aom_write_bit_buffer *wb) {
@@ -3703,9 +3701,7 @@
#if CONFIG_LOOP_RESTORATION
encode_restoration(cm, header_bc);
#endif // CONFIG_LOOP_RESTORATION
-#if !CONFIG_EC_ADAPT
update_txfm_probs(cm, header_bc, counts);
-#endif
update_coef_probs(cpi, header_bc);
#if CONFIG_VAR_TX
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 40929ef..8a6ad18 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -169,7 +169,7 @@
void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
nmv_context_counts *const nmv_counts) {
- int i, j;
+ int i;
#if CONFIG_REF_MV
int nmv_ctx = 0;
for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
@@ -180,6 +180,7 @@
w);
for (i = 0; i < 2; ++i) {
+ int j;
nmv_component *comp = &mvc->comps[i];
nmv_component_counts *comp_counts = &counts->comps[i];
@@ -193,6 +194,7 @@
}
for (i = 0; i < 2; ++i) {
+ int j;
for (j = 0; j < CLASS0_SIZE; ++j)
write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
@@ -218,6 +220,7 @@
write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
for (i = 0; i < 2; ++i) {
+ int j;
nmv_component *comp = &mvc->comps[i];
nmv_component_counts *comp_counts = &counts->comps[i];
@@ -231,6 +234,7 @@
}
for (i = 0; i < 2; ++i) {
+ int j;
for (j = 0; j < CLASS0_SIZE; ++j) {
write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index acdc13b..2fb651c 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -69,6 +69,11 @@
int speed) {
AV1_COMMON *const cm = &cpi->common;
+ // Limit memory usage for high resolutions
+ if (AOMMIN(cm->width, cm->height) > 1080) {
+ sf->use_upsampled_references = 0;
+ }
+
if (speed >= 1) {
if (AOMMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask =
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index f4bd142..77ae724 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -18,14 +18,6 @@
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
-static INLINE void mm256_reverse_epi16(__m256i *u) {
- const __m256i control = _mm256_set_epi16(
- 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
- 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
- __m256i v = _mm256_shuffle_epi8(*u, control);
- *u = _mm256_permute2x128_si256(v, v, 1);
-}
-
static int32_t get_16x16_sum(const int16_t *input, int stride) {
__m256i r0, r1, r2, r3, u0, u1;
__m256i zero = _mm256_setzero_si256();
@@ -71,134 +63,6 @@
_mm256_zeroupper();
}
-static void mm256_transpose_16x16(__m256i *in) {
- __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
- __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
- __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
- __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
- __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
- __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
- __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
- __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
- __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
- __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
- __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
- __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
- __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
- __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
- __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
- __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
- // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
- // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
- // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
- // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
- // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
- // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
- // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
- // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
-
- // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
- // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
- // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
- // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
- // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
- // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
- // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
- // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
-
- __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
- __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
- __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
- __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
- __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
- __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
- __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
- __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
- __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
- __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
- __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
- __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
- __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
- __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
- __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
- __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
- // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
- // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
- // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
- // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
- // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
- // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
- // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
- // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
-
- // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
- // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
- // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
- // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
- // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
- // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
- // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
- // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
-
- tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
- tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
- tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
- tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
- tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
- tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
- tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
- tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
- tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
- tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
- tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
- tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
- tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
- tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
- tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
- tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
- // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
- // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
- // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
- // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
- // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
- // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
- // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
- // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
-
- // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
- // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
- // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
- // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
- // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
- // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
- // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
- // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
-
- in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
- in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
- in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
- in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
- in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
- in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
- in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
- in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
- in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
- in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
- in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
- in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
- in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
- in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
- in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
- in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-
static INLINE void load_buffer_16x16(const int16_t *input, int stride,
int flipud, int fliplr, __m256i *in) {
if (!flipud) {
@@ -352,19 +216,6 @@
in[15] = _mm256_srai_epi16(in[15], 2);
}
-static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
- const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- __m256i y0 = _mm256_madd_epi16(a0, cospi);
- __m256i y1 = _mm256_madd_epi16(a1, cospi);
-
- y0 = _mm256_add_epi32(y0, dct_rounding);
- y1 = _mm256_add_epi32(y1, dct_rounding);
- y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
- y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
-
- return _mm256_packs_epi32(y0, y1);
-}
-
static void fdct16_avx2(__m256i *in) {
// sequence: cospi_L_H = pairs(L, H) and L first
const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
@@ -1099,31 +950,7 @@
}
#if CONFIG_EXT_TX
-static void fidtx16_avx2(__m256i *in) {
- const __m256i zero = _mm256_setzero_si256();
- const __m256i sqrt2_epi16 = _mm256_set1_epi16((int16_t)Sqrt2);
- const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- __m256i u0, u1;
- int i = 0;
-
- while (i < 16) {
- in[i] = _mm256_slli_epi16(in[i], 1);
-
- u0 = _mm256_unpacklo_epi16(zero, in[i]);
- u1 = _mm256_unpackhi_epi16(zero, in[i]);
-
- u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
- u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
-
- u0 = _mm256_add_epi32(u0, dct_const_rounding);
- u1 = _mm256_add_epi32(u1, dct_const_rounding);
-
- u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
- u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
- in[i] = _mm256_packs_epi32(u0, u1);
- i++;
- }
-}
+static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
#endif
void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 4a44e16..0b89071 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -33,6 +33,11 @@
av1_fht16x16_c(in, out, stride, tx_type);
}
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+ int tx_type) {
+ av1_iht16x16_256_add_c(in, dest, stride, tx_type);
+}
+
#if CONFIG_AOM_HIGHBITDEPTH
typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
int tx_type, int bd);
@@ -48,16 +53,6 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-#if HAVE_AVX2
-void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
- int tx_type) {
- (void)in;
- (void)out;
- (void)stride;
- (void)tx_type;
-}
-#endif
-
class AV1Trans16x16HT : public libaom_test::TransformTestBase,
public ::testing::TestWithParam<Ht16x16Param> {
public:
@@ -70,6 +65,7 @@
pitch_ = 16;
height_ = 16;
fwd_txfm_ref = fht16x16_ref;
+ inv_txfm_ref = iht16x16_ref;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
num_coeffs_ = GET_PARAM(4);
@@ -90,6 +86,7 @@
};
TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
#if CONFIG_AOM_HIGHBITDEPTH
class AV1HighbdTrans16x16HT
@@ -203,22 +200,27 @@
#if HAVE_AVX2
const Ht16x16Param kArrayHt16x16Param_avx2[] = {
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 0, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 1, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 2, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 3, AOM_BITS_8, 256),
#if CONFIG_EXT_TX
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 256)
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 4, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 5, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 6, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 7, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 12, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 13, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 14, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 15, AOM_BITS_8, 256)
#endif // CONFIG_EXT_TX
};
INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
diff --git a/test/av1_inv_txfm_test.cc b/test/av1_inv_txfm_test.cc
index 84e2402..8f6c868 100644
--- a/test/av1_inv_txfm_test.cc
+++ b/test/av1_inv_txfm_test.cc
@@ -24,7 +24,7 @@
#include "av1/common/blockd.h"
#include "av1/common/scan.h"
#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
+#include "aom_dsp/inv_txfm.h"
using libaom_test::ACMRandom;
@@ -104,10 +104,10 @@
INSTANTIATE_TEST_CASE_P(
C, AV1InvTxfm,
- ::testing::Values(IdctParam(&av1_idct4_c, &reference_idct_1d, 4, 1),
- IdctParam(&av1_idct8_c, &reference_idct_1d, 8, 2),
- IdctParam(&av1_idct16_c, &reference_idct_1d, 16, 4),
- IdctParam(&av1_idct32_c, &reference_idct_1d, 32, 6)));
+ ::testing::Values(IdctParam(&aom_idct4_c, &reference_idct_1d, 4, 1),
+ IdctParam(&aom_idct8_c, &reference_idct_1d, 8, 2),
+ IdctParam(&aom_idct16_c, &reference_idct_1d, 16, 4),
+ IdctParam(&aom_idct32_c, &reference_idct_1d, 32, 6)));
#if CONFIG_AV1_ENCODER
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
@@ -262,19 +262,19 @@
INSTANTIATE_TEST_CASE_P(
C, AV1PartialIDctTest,
- ::testing::Values(make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c,
- &av1_idct32x32_34_add_c, TX_32X32, 34),
- make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c,
- &av1_idct32x32_1_add_c, TX_32X32, 1),
- make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c,
- &av1_idct16x16_10_add_c, TX_16X16, 10),
- make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c,
- &av1_idct16x16_1_add_c, TX_16X16, 1),
- make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c,
- &av1_idct8x8_12_add_c, TX_8X8, 12),
- make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c,
- &av1_idct8x8_1_add_c, TX_8X8, 1),
- make_tuple(&av1_fdct4x4_c, &av1_idct4x4_16_add_c,
- &av1_idct4x4_1_add_c, TX_4X4, 1)));
+ ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+ &aom_idct32x32_34_add_c, TX_32X32, 34),
+ make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+ &aom_idct32x32_1_add_c, TX_32X32, 1),
+ make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+ &aom_idct16x16_10_add_c, TX_16X16, 10),
+ make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+ &aom_idct16x16_1_add_c, TX_16X16, 1),
+ make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+ &aom_idct8x8_12_add_c, TX_8X8, 12),
+ make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+ &aom_idct8x8_1_add_c, TX_8X8, 1),
+ make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
+ &aom_idct4x4_1_add_c, TX_4X4, 1)));
#endif // CONFIG_AV1_ENCODER
} // namespace
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 540136c..64bf2d6 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -210,7 +210,7 @@
int out_idx = j * stride + k;
ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
<< "Error: not bit-exact result at index: " << out_idx
- << " at test block: " << i;
+ << " j = " << j << " k = " << k << " at test block: " << i;
}
}
}