Update partial inverse DCT to match VP9
- Partial inverse DCT unit tests have been enhanced.
- The SSSE3 x86_64 IDCT assembly has been removed and replaced with SSSE3
  intrinsics (x86/inv_txfm_ssse3.c), lifting the x86_64-only restriction.
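For reference, the partial variants are selected by the end-of-block (eob)
position of the dequantized coefficients; the 1/10/38 thresholds in the sketch
below mirror the function names added in this patch. This is an illustrative
sketch only, not part of the change itself: the wrapper name
inv_txfm_16x16_add is hypothetical, and the eob value plus the
aom_idct16x16_*_add prototypes (from ./aom_dsp_rtcd.h) are assumed to be
available to the caller.

  /* Illustrative only: dispatch a 16x16 inverse DCT based on eob. */
  static void inv_txfm_16x16_add(const tran_low_t *input, uint8_t *dest,
                                 int stride, int eob) {
    if (eob == 1)
      aom_idct16x16_1_add(input, dest, stride);    // DC coefficient only
    else if (eob <= 10)
      aom_idct16x16_10_add(input, dest, stride);   // non-zero coeffs in top-left 4x4
    else if (eob <= 38)
      aom_idct16x16_38_add(input, dest, stride);   // non-zero coeffs in top-left 8x8
    else
      aom_idct16x16_256_add(input, dest, stride);  // full 16x16 transform
  }
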
Change-Id: Ic3bed2c0e70abdfd642a4f74fa969cc672d4795f
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 2111d62..8c7241b 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -238,9 +238,7 @@
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
-endif # ARCH_X86_64
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/save_reg_neon$(ASM)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 06fd899..36669fe 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -425,10 +425,10 @@
specialize qw/aom_idct4x4_1_add sse2/;
add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_64_add sse2/, "$ssse3_x86_64";
+ specialize qw/aom_idct8x8_64_add sse2 ssse3/;
add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_12_add sse2/, "$ssse3_x86_64";
+ specialize qw/aom_idct8x8_12_add sse2 ssse3/;
add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct8x8_1_add sse2/;
@@ -436,6 +436,8 @@
add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_256_add sse2/;
+ add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+
add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_10_add sse2/;
@@ -443,15 +445,15 @@
specialize qw/aom_idct16x16_1_add sse2/;
add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_1024_add sse2/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_1024_add sse2 ssse3/;
add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_135_add sse2/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_135_add sse2 ssse3/;
# Need to add 135 eob idct32x32 implementations.
$aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_34_add sse2/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_34_add sse2 ssse3/;
add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1_add sse2/;
@@ -480,10 +482,10 @@
specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/aom_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_1_add sse2 neon dspr2 msa/;
@@ -491,14 +493,16 @@
add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_256_add sse2 neon dspr2 msa/;
+ add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+
add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
# Need to add 135 eob idct32x32 implementations.
$aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
$aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
@@ -506,7 +510,7 @@
$aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_34_add sse2 ssse3 neon dspr2 msa/;
# Need to add 34 eob idct32x32 neon implementation.
$aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index d1735cf..51a9633 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -747,6 +747,32 @@
output[15] = WRAPLOW(-x1);
}
+void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+
+ // First transform rows. Since all non-zero dct coefficients are in
+ // the upper-left 8x8 area, we only need to compute the first 8 rows here.
+ for (i = 0; i < 8; ++i) {
+ aom_idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ aom_idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
+}
+
void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
tran_low_t out[16 * 16] = { 0 };
diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c
index 0ba452f0..1e916e0 100644
--- a/aom_dsp/x86/inv_txfm_sse2.c
+++ b/aom_dsp/x86/inv_txfm_sse2.c
@@ -258,63 +258,6 @@
in[1] = _mm_packs_epi32(u[2], u[3]);
}
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
-
// Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
res0, res1, res2, res3) \
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index 1e94a5d..95d246c 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -56,6 +56,37 @@
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+ }
+
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
@@ -195,6 +226,32 @@
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
+ { \
+ const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+ const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ }
+
void iadst16_8col(__m128i *in);
void idct16_8col(__m128i *in);
void aom_idct4_sse2(__m128i *in);
diff --git a/aom_dsp/x86/inv_txfm_ssse3.c b/aom_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 0000000..9d00679
--- /dev/null
+++ b/aom_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,1333 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+ in4 = load_input_data(input + 8 * 4);
+ in5 = load_input_data(input + 8 * 5);
+ in6 = load_input_data(input + 8 * 6);
+ in7 = load_input_data(input + 8 * 7);
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ {
+ /* Stage1 */
+ {
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
+
+ {
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp1 = _mm_madd_epi16(hi_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp3 = _mm_madd_epi16(hi_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp5 = _mm_madd_epi16(hi_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+ tmp7 = _mm_madd_epi16(hi_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, 14);
+ tmp1 = _mm_srai_epi32(tmp1, 14);
+ tmp2 = _mm_srai_epi32(tmp2, 14);
+ tmp3 = _mm_srai_epi32(tmp3, 14);
+ tmp4 = _mm_srai_epi32(tmp4, 14);
+ tmp5 = _mm_srai_epi32(tmp5, 14);
+ tmp6 = _mm_srai_epi32(tmp6, 14);
+ tmp7 = _mm_srai_epi32(tmp7, 14);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp1);
+ stp1_7 = _mm_packs_epi32(tmp2, tmp3);
+ stp1_5 = _mm_packs_epi32(tmp4, tmp5);
+ stp1_6 = _mm_packs_epi32(tmp6, tmp7);
+ }
+ }
+
+ /* Stage2 */
+ {
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
+
+ {
+ tmp0 = _mm_unpacklo_epi16(in0, in4);
+ tmp1 = _mm_unpackhi_epi16(in0, in4);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_0);
+ tmp4 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp5 = _mm_madd_epi16(tmp1, stk2_1);
+
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp2, tmp3);
+ stp2_1 = _mm_packs_epi32(tmp4, tmp5);
+
+ tmp0 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp1 = _mm_madd_epi16(hi_26, stg2_2);
+ tmp2 = _mm_madd_epi16(lo_26, stg2_3);
+ tmp3 = _mm_madd_epi16(hi_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, 14);
+ tmp1 = _mm_srai_epi32(tmp1, 14);
+ tmp2 = _mm_srai_epi32(tmp2, 14);
+ tmp3 = _mm_srai_epi32(tmp3, 14);
+
+ stp2_2 = _mm_packs_epi32(tmp0, tmp1);
+ stp2_3 = _mm_packs_epi32(tmp2, tmp3);
+ }
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+ }
+
+ /* Stage3 */
+ {
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_1);
+ tmp4 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp5 = _mm_madd_epi16(tmp1, stk2_0);
+
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp2, tmp3);
+ stp1_6 = _mm_packs_epi32(tmp4, tmp5);
+ }
+
+ /* Stage4 */
+ in0 = _mm_add_epi16(stp1_0, stp2_7);
+ in1 = _mm_add_epi16(stp1_1, stp1_6);
+ in2 = _mm_add_epi16(stp1_2, stp1_5);
+ in3 = _mm_add_epi16(stp1_3, stp2_4);
+ in4 = _mm_sub_epi16(stp1_3, stp2_4);
+ in5 = _mm_sub_epi16(stp1_2, stp1_5);
+ in6 = _mm_sub_epi16(stp1_1, stp1_6);
+ in7 = _mm_sub_epi16(stp1_0, stp2_7);
+ }
+ }
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
+ const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
+ const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
+ const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+
+ // Rows. Load 4-row input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+
+ // 8x4 Transpose
+ TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+
+ // Stage1
+ tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
+ tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
+ tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
+ tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
+
+ stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
+ stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
+
+ // Stage2
+ tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
+ stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
+
+ tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
+ tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
+ stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
+
+ tmp0 = _mm_add_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
+
+ stp2_4 = tmp0;
+ stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+ tmp1 = _mm_madd_epi16(tmp0, stg3_0);
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp1, tmp2);
+
+ // Stage3
+ tmp2 = _mm_add_epi16(stp2_0, stp2_2);
+ tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
+
+ stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
+ stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
+
+ // Stage4
+ tmp0 = _mm_add_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_add_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
+
+ TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+ /* Stage1 */
+ stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
+ stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
+ stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
+ stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
+
+ /* Stage2 */
+ stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
+ stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
+
+ stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
+ stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+ /* Stage3 */
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_0);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_1);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ stp1_5 = _mm_packs_epi32(tmp2, tmp3);
+
+ /* Stage4 */
+ in0 = _mm_add_epi16(stp1_0, stp2_7);
+ in1 = _mm_add_epi16(stp1_1, stp1_6);
+ in2 = _mm_add_epi16(stp1_2, stp1_5);
+ in3 = _mm_add_epi16(stp1_3, stp2_4);
+ in4 = _mm_sub_epi16(stp1_3, stp2_4);
+ in5 = _mm_sub_epi16(stp1_2, stp1_5);
+ in6 = _mm_sub_epi16(stp1_1, stp1_6);
+ in7 = _mm_sub_epi16(stp1_0, stp2_7);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+// Perform only the addition/subtraction butterfly; size is 16 or 32.
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
+#define BUTTERFLY_PAIR(x0, x1, co0, co1) \
+ do { \
+ tmp0 = _mm_madd_epi16(x0, co0); \
+ tmp1 = _mm_madd_epi16(x1, co0); \
+ tmp2 = _mm_madd_epi16(x0, co1); \
+ tmp3 = _mm_madd_epi16(x1, co1); \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ } while (0)
+
+static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
+ const __m128i *c0, const __m128i *c1, __m128i *y0,
+ __m128i *y1) {
+ __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ u0 = _mm_unpacklo_epi16(*x0, *x1);
+ u1 = _mm_unpackhi_epi16(*x0, *x1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+ *y0 = _mm_packs_epi32(tmp0, tmp1);
+ *y1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
+static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
+ const __m128i *c1) {
+ __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ u0 = _mm_unpacklo_epi16(*x0, *x1);
+ u1 = _mm_unpackhi_epi16(*x0, *x1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+ *x0 = _mm_packs_epi32(tmp0, tmp1);
+ *x1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
+static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
+ const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
+ const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
+ const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
+ const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
+
+ const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i x0, x1, x4, x5, x6, x7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+
+ // phase 1
+
+ // 0, 15
+ u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15
+ u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12
+ v15 = _mm_add_epi16(u2, u3);
+ // in[0], in[4]
+ x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0]
+ x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7]
+ v0 = _mm_add_epi16(x0, x7); // stp2_0
+ stp1[0] = _mm_add_epi16(v0, v15);
+ stp1[15] = _mm_sub_epi16(v0, v15);
+
+ // in[2], in[6]
+ u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8
+ u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11
+ butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14
+ butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13
+
+ v8 = _mm_add_epi16(u0, u1);
+ v9 = _mm_add_epi16(u4, u6);
+ v10 = _mm_sub_epi16(u4, u6);
+ v11 = _mm_sub_epi16(u0, u1);
+ v12 = _mm_sub_epi16(u2, u3);
+ v13 = _mm_sub_epi16(u5, u7);
+ v14 = _mm_add_epi16(u5, u7);
+
+ butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
+ butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
+
+ // 1, 14
+ x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0
+ // stp1[2] = stp1[0], stp1[3] = stp1[1]
+ x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4]
+ butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
+ v1 = _mm_add_epi16(x1, x6); // stp2_1
+ v2 = _mm_add_epi16(x0, x5); // stp2_2
+ stp1[1] = _mm_add_epi16(v1, v14);
+ stp1[14] = _mm_sub_epi16(v1, v14);
+
+ stp1[2] = _mm_add_epi16(v2, v13);
+ stp1[13] = _mm_sub_epi16(v2, v13);
+
+ v3 = _mm_add_epi16(x1, x4); // stp2_3
+ v4 = _mm_sub_epi16(x1, x4); // stp2_4
+
+ v5 = _mm_sub_epi16(x0, x5); // stp2_5
+
+ v6 = _mm_sub_epi16(x1, x6); // stp2_6
+ v7 = _mm_sub_epi16(x0, x7); // stp2_7
+ stp1[3] = _mm_add_epi16(v3, v12);
+ stp1[12] = _mm_sub_epi16(v3, v12);
+
+ stp1[6] = _mm_add_epi16(v6, v9);
+ stp1[9] = _mm_sub_epi16(v6, v9);
+
+ stp1[7] = _mm_add_epi16(v7, v8);
+ stp1[8] = _mm_sub_epi16(v7, v8);
+
+ stp1[4] = _mm_add_epi16(v4, v11);
+ stp1[11] = _mm_sub_epi16(v4, v11);
+
+ stp1[5] = _mm_add_epi16(v5, v10);
+ stp1[10] = _mm_sub_epi16(v5, v10);
+}
+
+static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
+ const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
+ const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
+ const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
+ const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
+ const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
+ const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
+ const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
+ const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ v16 = _mm_mulhrs_epi16(in[1], stk1_0);
+ v31 = _mm_mulhrs_epi16(in[1], stk1_1);
+
+ v19 = _mm_mulhrs_epi16(in[7], stk1_6);
+ v28 = _mm_mulhrs_epi16(in[7], stk1_7);
+
+ v20 = _mm_mulhrs_epi16(in[5], stk1_8);
+ v27 = _mm_mulhrs_epi16(in[5], stk1_9);
+
+ v23 = _mm_mulhrs_epi16(in[3], stk1_14);
+ v24 = _mm_mulhrs_epi16(in[3], stk1_15);
+
+ butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
+ butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
+ butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
+ butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+ u24 = _mm_add_epi16(v24, v27);
+ u27 = _mm_sub_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u28 = _mm_sub_epi16(v31, v28);
+ u31 = _mm_add_epi16(v28, v31);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+
+ stp1[16] = _mm_add_epi16(u16, u23);
+ stp1[23] = _mm_sub_epi16(u16, u23);
+
+ stp1[17] = _mm_add_epi16(u17, u22);
+ stp1[22] = _mm_sub_epi16(u17, u22);
+
+ stp1[18] = _mm_add_epi16(u18, u21);
+ stp1[21] = _mm_sub_epi16(u18, u21);
+
+ stp1[19] = _mm_add_epi16(u19, u20);
+ stp1[20] = _mm_sub_epi16(u19, u20);
+
+ stp1[24] = _mm_sub_epi16(u31, u24);
+ stp1[31] = _mm_add_epi16(u24, u31);
+
+ stp1[25] = _mm_sub_epi16(u30, u25);
+ stp1[30] = _mm_add_epi16(u25, u30);
+
+ stp1[26] = _mm_sub_epi16(u29, u26);
+ stp1[29] = _mm_add_epi16(u26, u29);
+
+ stp1[27] = _mm_sub_epi16(u28, u27);
+ stp1[28] = _mm_add_epi16(u27, u28);
+
+ butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i in[32], col[32];
+ __m128i stp1[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 32);
+ in[2] = load_input_data(input + 64);
+ in[3] = load_input_data(input + 96);
+ in[4] = load_input_data(input + 128);
+ in[5] = load_input_data(input + 160);
+ in[6] = load_input_data(input + 192);
+ in[7] = load_input_data(input + 224);
+
+ array_transpose_8x8(in, in);
+ idct32_34_first_half(in, stp1);
+ idct32_34_second_half(in, stp1);
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ add_sub_butterfly(stp1, col, 32);
+ for (i = 0; i < 4; i++) {
+ int j;
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ idct32_34_first_half(in, stp1);
+ idct32_34_second_half(in, stp1);
+
+ // 2_D: Calculate the results and store them to destination.
+ add_sub_butterfly(stp1, in, 32);
+ for (j = 0; j < 32; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+// in0[16] represents the left 8x16 block
+// in1[16] represents the right 8x16 block
+static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
+ __m128i *in1) {
+ int i;
+ for (i = 0; i < 16; i++) {
+ in0[i] = load_input_data(input);
+ in1[i] = load_input_data(input + 8);
+ input += 32;
+ }
+}
+
+static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
+ __m128i *out1) {
+ array_transpose_8x8(in0, out0);
+ array_transpose_8x8(&in0[8], out1);
+ array_transpose_8x8(in1, &out0[8]);
+ array_transpose_8x8(&in1[8], &out1[8]);
+}
+
+// Group the coefficient calculation into smaller functions
+// to prevent stack spillover:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ {
+ const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
+ const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
+ u0 = _mm_mulhrs_epi16(in[0], stk4_0);
+ u2 = _mm_mulhrs_epi16(in[8], stk4_2);
+ u3 = _mm_mulhrs_epi16(in[8], stk4_3);
+ u1 = u0;
+ }
+
+ v0 = _mm_add_epi16(u0, u3);
+ v1 = _mm_add_epi16(u1, u2);
+ v2 = _mm_sub_epi16(u1, u2);
+ v3 = _mm_sub_epi16(u0, u3);
+
+ {
+ const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
+ const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
+ u4 = _mm_mulhrs_epi16(in[4], stk3_0);
+ u7 = _mm_mulhrs_epi16(in[4], stk3_1);
+ u5 = _mm_mulhrs_epi16(in[12], stk3_2);
+ u6 = _mm_mulhrs_epi16(in[12], stk3_3);
+ }
+
+ v4 = _mm_add_epi16(u4, u5);
+ v5 = _mm_sub_epi16(u4, u5);
+ v6 = _mm_sub_epi16(u7, u6);
+ v7 = _mm_add_epi16(u7, u6);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
+ }
+
+ out[0] = _mm_add_epi16(v0, v7);
+ out[1] = _mm_add_epi16(v1, v6);
+ out[2] = _mm_add_epi16(v2, v5);
+ out[3] = _mm_add_epi16(v3, v4);
+ out[4] = _mm_sub_epi16(v3, v4);
+ out[5] = _mm_sub_epi16(v2, v5);
+ out[6] = _mm_sub_epi16(v1, v6);
+ out[7] = _mm_sub_epi16(v0, v7);
+}
+
+static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v8, v9, v10, v11, v12, v13, v14, v15;
+
+ {
+ const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
+ const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
+ const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
+ const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
+ const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
+ const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
+ const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
+ const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
+ u8 = _mm_mulhrs_epi16(in[2], stk2_0);
+ u15 = _mm_mulhrs_epi16(in[2], stk2_1);
+ u9 = _mm_mulhrs_epi16(in[14], stk2_2);
+ u14 = _mm_mulhrs_epi16(in[14], stk2_3);
+ u10 = _mm_mulhrs_epi16(in[10], stk2_4);
+ u13 = _mm_mulhrs_epi16(in[10], stk2_5);
+ u11 = _mm_mulhrs_epi16(in[6], stk2_6);
+ u12 = _mm_mulhrs_epi16(in[6], stk2_7);
+ }
+
+ v8 = _mm_add_epi16(u8, u9);
+ v9 = _mm_sub_epi16(u8, u9);
+ v10 = _mm_sub_epi16(u11, u10);
+ v11 = _mm_add_epi16(u11, u10);
+ v12 = _mm_add_epi16(u12, u13);
+ v13 = _mm_sub_epi16(u12, u13);
+ v14 = _mm_sub_epi16(u15, u14);
+ v15 = _mm_add_epi16(u15, u14);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(v8, v11);
+ out[1] = _mm_add_epi16(v9, v10);
+ out[2] = _mm_sub_epi16(v9, v10);
+ out[3] = _mm_sub_epi16(v8, v11);
+ out[4] = _mm_sub_epi16(v15, v12);
+ out[5] = _mm_sub_epi16(v14, v13);
+ out[6] = _mm_add_epi16(v14, v13);
+ out[7] = _mm_add_epi16(v15, v12);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
+ }
+}
+
+// For each 8x32 block, take the 8 even-indexed inputs of in[16] and
+// write the first half (16 values) to out[32].
+static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_8x32_135_quarter_1(in, temp);
+ idct32_8x32_135_quarter_2(in, &temp[8]);
+ add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block, take the 8 odd-indexed inputs of in[16] and
+// write the second half (16 values) to out[32].
+static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ {
+ const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
+ const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
+ const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
+ const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
+
+ const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
+ const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
+ const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
+ const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
+ const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
+ const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
+ const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
+ const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
+
+ const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
+ const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
+ const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
+ const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
+ u16 = _mm_mulhrs_epi16(in[1], stk1_0);
+ u31 = _mm_mulhrs_epi16(in[1], stk1_1);
+ u17 = _mm_mulhrs_epi16(in[15], stk1_2);
+ u30 = _mm_mulhrs_epi16(in[15], stk1_3);
+
+ u18 = _mm_mulhrs_epi16(in[9], stk1_4);
+ u29 = _mm_mulhrs_epi16(in[9], stk1_5);
+ u19 = _mm_mulhrs_epi16(in[7], stk1_6);
+ u28 = _mm_mulhrs_epi16(in[7], stk1_7);
+
+ u20 = _mm_mulhrs_epi16(in[5], stk1_8);
+ u27 = _mm_mulhrs_epi16(in[5], stk1_9);
+ u21 = _mm_mulhrs_epi16(in[11], stk1_10);
+ u26 = _mm_mulhrs_epi16(in[11], stk1_11);
+
+ u22 = _mm_mulhrs_epi16(in[13], stk1_12);
+ u25 = _mm_mulhrs_epi16(in[13], stk1_13);
+ u23 = _mm_mulhrs_epi16(in[3], stk1_14);
+ u24 = _mm_mulhrs_epi16(in[3], stk1_15);
+ }
+
+ v16 = _mm_add_epi16(u16, u17);
+ v17 = _mm_sub_epi16(u16, u17);
+ v18 = _mm_sub_epi16(u19, u18);
+ v19 = _mm_add_epi16(u19, u18);
+
+ v20 = _mm_add_epi16(u20, u21);
+ v21 = _mm_sub_epi16(u20, u21);
+ v22 = _mm_sub_epi16(u23, u22);
+ v23 = _mm_add_epi16(u23, u22);
+
+ v24 = _mm_add_epi16(u24, u25);
+ v25 = _mm_sub_epi16(u24, u25);
+ v26 = _mm_sub_epi16(u27, u26);
+ v27 = _mm_add_epi16(u27, u26);
+
+ v28 = _mm_add_epi16(u28, u29);
+ v29 = _mm_sub_epi16(u28, u29);
+ v30 = _mm_sub_epi16(u31, u30);
+ v31 = _mm_add_epi16(u31, u30);
+
+ {
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
+ }
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+
+ u24 = _mm_add_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u27 = _mm_sub_epi16(v24, v27);
+ u28 = _mm_sub_epi16(v31, v28);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+ u31 = _mm_add_epi16(v28, v31);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(u16, u23);
+ out[1] = _mm_add_epi16(u17, u22);
+ out[2] = _mm_add_epi16(u18, u21);
+ out[3] = _mm_add_epi16(u19, u20);
+ v20 = _mm_sub_epi16(u19, u20);
+ v21 = _mm_sub_epi16(u18, u21);
+ v22 = _mm_sub_epi16(u17, u22);
+ v23 = _mm_sub_epi16(u16, u23);
+
+ v24 = _mm_sub_epi16(u31, u24);
+ v25 = _mm_sub_epi16(u30, u25);
+ v26 = _mm_sub_epi16(u29, u26);
+ v27 = _mm_sub_epi16(u28, u27);
+ out[12] = _mm_add_epi16(u27, u28);
+ out[13] = _mm_add_epi16(u26, u29);
+ out[14] = _mm_add_epi16(u25, u30);
+ out[15] = _mm_add_epi16(u24, u31);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
+ butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
+ butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
+ butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
+ }
+}
+
+// Input: an 8x16 block in in[0..15]; output: the 8x32 result written to in[0..31]
+static void idct32_8x32_135(__m128i *in /*in[32]*/) {
+ __m128i out[32];
+ idct32_8x32_quarter_1_2(in, out);
+ idct32_8x32_quarter_3_4(in, &out[16]);
+ add_sub_butterfly(out, in, 32);
+}
+
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ RECON_AND_STORE(dst, in[j]);
+ dst += stride;
+ RECON_AND_STORE(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
+ int stride) {
+ store_buffer_8x32(in0, dest, stride);
+ store_buffer_8x32(in1, dest + 8, stride);
+}
+
+static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
+ idct32_8x32_135(col0);
+ idct32_8x32_135(col1);
+}
+
+typedef enum { left_16, right_16 } ColsIndicator;
+
+static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
+ ColsIndicator cols) {
+ switch (cols) {
+ case left_16: {
+ int i;
+ array_transpose_16x16(in0, in1);
+ for (i = 0; i < 16; ++i) {
+ store[i] = in0[16 + i];
+ store[16 + i] = in1[16 + i];
+ }
+ break;
+ }
+ case right_16: {
+ array_transpose_16x16_2(store, &store[16], in0, in1);
+ break;
+ }
+ default: { assert(0); }
+ }
+}
+
+// Only upper-left 16x16 has non-zero coeff
+void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ // Each array represents an 8x32 block
+ __m128i col0[32], col1[32];
+ // This array represents a 16x16 block
+ __m128i temp[32];
+
+ // Load input data. Only need to load the top left 16x16 block.
+ load_buffer_16x16(input, col0, col1);
+
+ // columns
+ array_transpose_16x16(col0, col1);
+ idct32_135(col0, col1);
+
+ // rows
+ transpose_and_copy_16x16(col0, col1, temp, left_16);
+ idct32_135(col0, col1);
+ recon_and_store(col0, col1, dest, stride);
+
+ transpose_and_copy_16x16(col0, col1, temp, right_16);
+ idct32_135(col0, col1);
+ recon_and_store(col0, col1, dest + 16, stride);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs are at indices 2, 6, 10, 14, 18, 22, 26, 30;
+// outputs are pixels 8-15 of __m128i in[32]
+static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_
+ __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_
+
+ {
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
+ butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
+ }
+
+ v8 = _mm_add_epi16(u8, u9);
+ v9 = _mm_sub_epi16(u8, u9);
+ v14 = _mm_sub_epi16(u15, u14);
+ v15 = _mm_add_epi16(u15, u14);
+
+ {
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
+ butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
+ }
+
+ v10 = _mm_sub_epi16(u11, u10);
+ v11 = _mm_add_epi16(u11, u10);
+ v12 = _mm_add_epi16(u12, u13);
+ v13 = _mm_sub_epi16(u12, u13);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(v8, v11);
+ out[1] = _mm_add_epi16(v9, v10);
+ out[6] = _mm_add_epi16(v14, v13);
+ out[7] = _mm_add_epi16(v15, v12);
+
+ out[2] = _mm_sub_epi16(v9, v10);
+ out[3] = _mm_sub_epi16(v8, v11);
+ out[4] = _mm_sub_epi16(v15, v12);
+ out[5] = _mm_sub_epi16(v14, v13);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
+ }
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs are at indices 0, 4, 8, 12, 16, 20, 24, 28;
+// outputs are pixels 0-7 of __m128i in[32]
+static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_
+
+ {
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
+ butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
+ }
+
+ v4 = _mm_add_epi16(u4, u5);
+ v5 = _mm_sub_epi16(u4, u5);
+ v6 = _mm_sub_epi16(u7, u6);
+ v7 = _mm_add_epi16(u7, u6);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
+
+ butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
+ butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
+ }
+
+ v0 = _mm_add_epi16(u0, u3);
+ v1 = _mm_add_epi16(u1, u2);
+ v2 = _mm_sub_epi16(u1, u2);
+ v3 = _mm_sub_epi16(u0, u3);
+
+ out[0] = _mm_add_epi16(v0, v7);
+ out[1] = _mm_add_epi16(v1, v6);
+ out[2] = _mm_add_epi16(v2, v5);
+ out[3] = _mm_add_epi16(v3, v4);
+ out[4] = _mm_sub_epi16(v3, v4);
+ out[5] = _mm_sub_epi16(v2, v5);
+ out[6] = _mm_sub_epi16(v1, v6);
+ out[7] = _mm_sub_epi16(v0, v7);
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs are at the odd indices
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31;
+// outputs are pixels 16-23 and 24-31 of __m128i in[32].
+// We avoid hiding an offset of 16 inside this function, so the results are
+// written to indices 0-15 of the out[16] array.
+static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ {
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
+ butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
+ butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
+ butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
+
+ butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
+ butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
+
+ butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
+ butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
+ }
+
+ v16 = _mm_add_epi16(u16, u17);
+ v17 = _mm_sub_epi16(u16, u17);
+ v18 = _mm_sub_epi16(u19, u18);
+ v19 = _mm_add_epi16(u19, u18);
+
+ v20 = _mm_add_epi16(u20, u21);
+ v21 = _mm_sub_epi16(u20, u21);
+ v22 = _mm_sub_epi16(u23, u22);
+ v23 = _mm_add_epi16(u23, u22);
+
+ v24 = _mm_add_epi16(u24, u25);
+ v25 = _mm_sub_epi16(u24, u25);
+ v26 = _mm_sub_epi16(u27, u26);
+ v27 = _mm_add_epi16(u27, u26);
+
+ v28 = _mm_add_epi16(u28, u29);
+ v29 = _mm_sub_epi16(u28, u29);
+ v30 = _mm_sub_epi16(u31, u30);
+ v31 = _mm_add_epi16(u31, u30);
+
+ {
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
+ }
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+
+ u24 = _mm_add_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u27 = _mm_sub_epi16(v24, v27);
+
+ u28 = _mm_sub_epi16(v31, v28);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+ u31 = _mm_add_epi16(v28, v31);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(u16, u23);
+ out[1] = _mm_add_epi16(u17, u22);
+ out[2] = _mm_add_epi16(u18, u21);
+ out[3] = _mm_add_epi16(u19, u20);
+ out[4] = _mm_sub_epi16(u19, u20);
+ out[5] = _mm_sub_epi16(u18, u21);
+ out[6] = _mm_sub_epi16(u17, u22);
+ out[7] = _mm_sub_epi16(u16, u23);
+
+ out[8] = _mm_sub_epi16(u31, u24);
+ out[9] = _mm_sub_epi16(u30, u25);
+ out[10] = _mm_sub_epi16(u29, u26);
+ out[11] = _mm_sub_epi16(u28, u27);
+ out[12] = _mm_add_epi16(u27, u28);
+ out[13] = _mm_add_epi16(u26, u29);
+ out[14] = _mm_add_epi16(u25, u30);
+ out[15] = _mm_add_epi16(u24, u31);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
+ butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
+ butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
+ butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
+ }
+}
+
+static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_full_8x32_quarter_1(in, temp);
+ idct32_full_8x32_quarter_2(in, &temp[8]);
+ add_sub_butterfly(temp, out, 16);
+}
+
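+// The even half (quarters 1/2) and the odd half (quarters 3/4) are combined
+// by a final add/sub butterfly pass to give all 32 rows.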
+static void idct32_full_8x32(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[32];
+ idct32_full_8x32_quarter_1_2(in, temp);
+ idct32_full_8x32_quarter_3_4(in, &temp[16]);
+ add_sub_butterfly(temp, out, 32);
+}
+
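+// Load an 8x32 tile of coefficients: each of the 8 rows is read as four
+// 8-lane vectors.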
+static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ in[i] = load_input_data(input);
+ in[i + 8] = load_input_data(input + 8);
+ in[i + 16] = load_input_data(input + 16);
+ in[i + 24] = load_input_data(input + 24);
+ input += 32;
+ }
+}
+
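+// Pass one transforms four groups of 8 rows into col[]; pass two transposes
+// col[], transforms the columns and hands each 8-column strip to
+// store_buffer_8x32().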
+void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[128], in[32];
+ int i, j;
+
+ // rows
+ for (i = 0; i < 4; ++i) {
+ load_buffer_8x32(input, in);
+ input += 32 << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ idct32_full_8x32(in, col + (i << 5));
+ }
+
+ // columns
+ for (i = 0; i < 4; ++i) {
+ j = i << 3;
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ idct32_full_8x32(in, in);
+ store_buffer_8x32(in, dest, stride);
+ dest += 8;
+ }
+}
diff --git a/aom_dsp/x86/inv_txfm_ssse3_x86_64.asm b/aom_dsp/x86/inv_txfm_ssse3_x86_64.asm
deleted file mode 100644
index 11d54c7..0000000
--- a/aom_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,1796 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the inverse transformation. Part
-; of the functions are originally derived from the ffmpeg project.
-; Note that the current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-
-pw_m2404x2: times 8 dw -2404*2
-pw_m4756x2: times 8 dw -4756*2
-pw_m5520x2: times 8 dw -5520*2
-pw_m8423x2: times 8 dw -8423*2
-pw_m9102x2: times 8 dw -9102*2
-pw_m10394x2: times 8 dw -10394*2
-pw_m11003x2: times 8 dw -11003*2
-
-pw_16364x2: times 8 dw 16364*2
-pw_16305x2: times 8 dw 16305*2
-pw_16207x2: times 8 dw 16207*2
-pw_16069x2: times 8 dw 16069*2
-pw_15893x2: times 8 dw 15893*2
-pw_15679x2: times 8 dw 15679*2
-pw_15426x2: times 8 dw 15426*2
-pw_15137x2: times 8 dw 15137*2
-pw_14811x2: times 8 dw 14811*2
-pw_14449x2: times 8 dw 14449*2
-pw_14053x2: times 8 dw 14053*2
-pw_13623x2: times 8 dw 13623*2
-pw_13160x2: times 8 dw 13160*2
-pw_12665x2: times 8 dw 12665*2
-pw_12140x2: times 8 dw 12140*2
-pw__9760x2: times 8 dw 9760*2
-pw__7723x2: times 8 dw 7723*2
-pw__7005x2: times 8 dw 7005*2
-pw__6270x2: times 8 dw 6270*2
-pw__3981x2: times 8 dw 3981*2
-pw__3196x2: times 8 dw 3196*2
-pw__1606x2: times 8 dw 1606*2
-pw___804x2: times 8 dw 804*2
-
-pd_8192: times 4 dd 8192
-pw_32: times 8 dw 32
-pw_16: times 8 dw 16
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
-pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
-pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
-%endmacro
-
-TRANSFORM_COEFFS 6270, 15137
-TRANSFORM_COEFFS 3196, 16069
-TRANSFORM_COEFFS 13623, 9102
-
-; constants for 32x32_34
-TRANSFORM_COEFFS 804, 16364
-TRANSFORM_COEFFS 15426, 5520
-TRANSFORM_COEFFS 3981, 15893
-TRANSFORM_COEFFS 16207, 2404
-TRANSFORM_COEFFS 1606, 16305
-TRANSFORM_COEFFS 15679, 4756
-TRANSFORM_COEFFS 11585, 11585
-
-; constants for 32x32_1024
-TRANSFORM_COEFFS 12140, 11003
-TRANSFORM_COEFFS 7005, 14811
-TRANSFORM_COEFFS 14053, 8423
-TRANSFORM_COEFFS 9760, 13160
-TRANSFORM_COEFFS 12665, 10394
-TRANSFORM_COEFFS 7723, 14449
-
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
-%endmacro
-
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
-%endmacro
-
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-PAIR_PP_COEFFS 30274, 12540
-PAIR_PP_COEFFS 6392, 32138
-PAIR_MP_COEFFS 18204, 27246
-
-PAIR_PP_COEFFS 12540, 12540
-PAIR_PP_COEFFS 30274, 30274
-PAIR_PP_COEFFS 6392, 6392
-PAIR_PP_COEFFS 32138, 32138
-PAIR_MM_COEFFS 18204, 18204
-PAIR_PP_COEFFS 27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
- psubw m%3, m%1, m%2
- paddw m%1, m%2
- SWAP %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
- pmaddwd m%1, m%3, %5
- pmaddwd m%2, m%3, %6
- paddd m%1, %4
- paddd m%2, %4
- psrad m%1, 14
- psrad m%2, 14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
- punpckh%1 m%4, m%2, m%3
- punpckl%1 m%2, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
- INTERLEAVE_2X wd, %1, %2, %9
- INTERLEAVE_2X wd, %3, %4, %9
- INTERLEAVE_2X wd, %5, %6, %9
- INTERLEAVE_2X wd, %7, %8, %9
-
- INTERLEAVE_2X dq, %1, %3, %9
- INTERLEAVE_2X dq, %2, %4, %9
- INTERLEAVE_2X dq, %5, %7, %9
- INTERLEAVE_2X dq, %6, %8, %9
-
- INTERLEAVE_2X qdq, %1, %5, %9
- INTERLEAVE_2X qdq, %3, %7, %9
- INTERLEAVE_2X qdq, %2, %6, %9
- INTERLEAVE_2X qdq, %4, %8, %9
-
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-%macro IDCT8_1D 0
- SUM_SUB 0, 4, 9
- BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
- pmulhrsw m0, m12
- pmulhrsw m4, m12
- BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
- BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10
-
- SUM_SUB 1, 5, 9
- SUM_SUB 7, 3, 9
- SUM_SUB 0, 6, 9
- SUM_SUB 4, 2, 9
- SUM_SUB 3, 5, 9
- pmulhrsw m3, m12
- pmulhrsw m5, m12
-
- SUM_SUB 0, 7, 9
- SUM_SUB 4, 3, 9
- SUM_SUB 2, 5, 9
- SUM_SUB 6, 1, 9
-
- SWAP 3, 6
- SWAP 1, 4
-%endmacro
-
-; This macro handles 8 pixels per line
-%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
- paddw m%1, m11
- paddw m%2, m11
- psraw m%1, 5
- psraw m%2, 5
-
- movh m%3, [outputq]
- movh m%4, [outputq + strideq]
- punpcklbw m%3, m%5
- punpcklbw m%4, m%5
- paddw m%3, m%1
- paddw m%4, m%2
- packuswb m%3, m%5
- packuswb m%4, m%5
- movh [outputq], m%3
- movh [outputq + strideq], m%4
-%endmacro
-
-INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
- mova m8, [pd_8192]
- mova m11, [pw_16]
- mova m12, [pw_11585x2]
-
- lea r3, [2 * strideq]
-%if CONFIG_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
- mova m4, [inputq + 128]
- packssdw m4, [inputq + 144]
- mova m5, [inputq + 160]
- packssdw m5, [inputq + 176]
- mova m6, [inputq + 192]
- packssdw m6, [inputq + 208]
- mova m7, [inputq + 224]
- packssdw m7, [inputq + 240]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
- mova m4, [inputq + 64]
- mova m5, [inputq + 80]
- mova m6, [inputq + 96]
- mova m7, [inputq + 112]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- IDCT8_1D
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- IDCT8_1D
-
- pxor m12, m12
- ADD_STORE_8P_2X 0, 1, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 2, 3, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 4, 5, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
- RET
-
-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
- mova m8, [pd_8192]
- mova m11, [pw_16]
- mova m12, [pw_11585x2]
-
- lea r3, [2 * strideq]
-
-%if CONFIG_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
-%endif
-
- punpcklwd m0, m1
- punpcklwd m2, m3
- punpckhdq m9, m0, m2
- punpckldq m0, m2
- SWAP 2, 9
-
- ; m0 -> [0], [0]
- ; m1 -> [1], [1]
- ; m2 -> [2], [2]
- ; m3 -> [3], [3]
- punpckhqdq m10, m0, m0
- punpcklqdq m0, m0
- punpckhqdq m9, m2, m2
- punpcklqdq m2, m2
- SWAP 1, 10
- SWAP 3, 9
-
- pmulhrsw m0, m12
- pmulhrsw m2, [dpw_30274_12540]
- pmulhrsw m1, [dpw_6392_32138]
- pmulhrsw m3, [dpw_m18204_27246]
-
- SUM_SUB 0, 2, 9
- SUM_SUB 1, 3, 9
-
- punpcklqdq m9, m3, m3
- punpckhqdq m5, m3, m9
-
- SUM_SUB 3, 5, 9
- punpckhqdq m5, m3
- pmulhrsw m5, m12
-
- punpckhqdq m9, m1, m5
- punpcklqdq m1, m5
- SWAP 5, 9
-
- SUM_SUB 0, 5, 9
- SUM_SUB 2, 1, 9
-
- punpckhqdq m3, m0, m0
- punpckhqdq m4, m1, m1
- punpckhqdq m6, m5, m5
- punpckhqdq m7, m2, m2
-
- punpcklwd m0, m3
- punpcklwd m7, m2
- punpcklwd m1, m4
- punpcklwd m6, m5
-
- punpckhdq m4, m0, m7
- punpckldq m0, m7
- punpckhdq m10, m1, m6
- punpckldq m5, m1, m6
-
- punpckhqdq m1, m0, m5
- punpcklqdq m0, m5
- punpckhqdq m3, m4, m10
- punpcklqdq m2, m4, m10
-
-
- pmulhrsw m0, m12
- pmulhrsw m6, m2, [dpw_30274_30274]
- pmulhrsw m4, m2, [dpw_12540_12540]
-
- pmulhrsw m7, m1, [dpw_32138_32138]
- pmulhrsw m1, [dpw_6392_6392]
- pmulhrsw m5, m3, [dpw_m18204_m18204]
- pmulhrsw m3, [dpw_27246_27246]
-
- mova m2, m0
- SUM_SUB 0, 6, 9
- SUM_SUB 2, 4, 9
- SUM_SUB 1, 5, 9
- SUM_SUB 7, 3, 9
-
- SUM_SUB 3, 5, 9
- pmulhrsw m3, m12
- pmulhrsw m5, m12
-
- SUM_SUB 0, 7, 9
- SUM_SUB 2, 3, 9
- SUM_SUB 4, 5, 9
- SUM_SUB 6, 1, 9
-
- SWAP 3, 6
- SWAP 1, 2
- SWAP 2, 4
-
-
- pxor m12, m12
- ADD_STORE_8P_2X 0, 1, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 2, 3, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 4, 5, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
- RET
-
-%define idx0 16 * 0
-%define idx1 16 * 1
-%define idx2 16 * 2
-%define idx3 16 * 3
-%define idx4 16 * 4
-%define idx5 16 * 5
-%define idx6 16 * 6
-%define idx7 16 * 7
-%define idx8 16 * 0
-%define idx9 16 * 1
-%define idx10 16 * 2
-%define idx11 16 * 3
-%define idx12 16 * 4
-%define idx13 16 * 5
-%define idx14 16 * 6
-%define idx15 16 * 7
-%define idx16 16 * 0
-%define idx17 16 * 1
-%define idx18 16 * 2
-%define idx19 16 * 3
-%define idx20 16 * 4
-%define idx21 16 * 5
-%define idx22 16 * 6
-%define idx23 16 * 7
-%define idx24 16 * 0
-%define idx25 16 * 1
-%define idx26 16 * 2
-%define idx27 16 * 3
-%define idx28 16 * 4
-%define idx29 16 * 5
-%define idx30 16 * 6
-%define idx31 16 * 7
-
-; FROM idct32x32_add_neon.asm
-;
-; Instead of doing the transforms stage by stage, it is done by loading
-; some input values and doing as many stages as possible to minimize the
-; storing/loading of intermediate results. To fit within registers, the
-; final coefficients are cut into four blocks:
-; BLOCK A: 16-19,28-31
-; BLOCK B: 20-23,24-27
-; BLOCK C: 8-11,12-15
-; BLOCK D: 0-3,4-7
-; Blocks A and C are straight calculation through the various stages. In
-; block B, further calculations are performed using the results from
-; block A. In block D, further calculations are performed using the results
-; from block C and then the final calculations are done using results from
-; block A and B which have been combined at the end of block B.
-;
-
-%macro IDCT32X32_34 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- mova [r4 + 0], m0
- pmulhrsw m11, [pw_16364x2] ; stp2_31
- mova [r4 + 16 * 2], m2
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- mova [r4 + 16 * 4], m4
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
- mova [r4 + 16 * 6], m6
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, m1 ; stp1_16
- mova m0, m11 ; stp1_31
- mova m4, m7 ; stp1_28
- mova m15, m12 ; stp1_19
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m15
- pmulhrsw m6, [pw_15893x2] ; stp2_27
- mova [stp + %4 + idx30], m2
- mova m2, m3
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- mova [stp + %4 + idx31], m11
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m13, m5 ; stp1_20
- mova m14, m6 ; stp1_27
- mova m15, m3 ; stp1_23
- mova m11, m2 ; stp1_24
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
- SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m10, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m10
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 11, 15, 9
- pmulhrsw m11, m10 ; stp1_25
- pmulhrsw m15, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_24
- pmulhrsw m3, m10 ; stp1_23
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 11, 15
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
-
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m11
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m6, [rsp + transposed_in + 16 * 6]
-
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- pmulhrsw m1, [pw_16305x2] ; stp2_15
- mova [stp + %3 + idx22], m15
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- mova [stp + %3 + idx23], m3
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m3, m0 ; stp1_8
- mova m2, m1 ; stp1_15
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- mova m4, m7 ; stp1_11
- mova m5, m6 ; stp1_12
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
-
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m10, [pw_11585x2]
- pmulhrsw m0, m10 ; stp1_1
-
- mova m14, m11 ; stp1_4
- mova m13, m12 ; stp1_7
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m7, m0 ; stp1_0 = stp1_1
- mova m4, m0 ; stp1_1
- mova m2, m7 ; stp1_0
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
- SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m15, [stp + %4 + idx30]
- mova m10, [stp + %4 + idx31]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m7
- mova [stp + %4 + idx30], m15
- mova [stp + %4 + idx31], m10
- mova m7, [stp + %4 + idx28]
- mova m0, [stp + %4 + idx29]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m4
- mova [stp + %4 + idx28], m7
- mova [stp + %4 + idx29], m0
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m4, [stp + %3 + idx19]
- SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m4
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m0, [stp + %4 + idx27]
- mova m1, [stp + %4 + idx26]
- mova m2, [stp + %4 + idx25]
- mova m3, [stp + %4 + idx24]
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- mova [stp + %4 + idx27], m0
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx24], m3
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-%macro RECON_AND_STORE 1
- mova m11, [pw_32]
- lea stp, [rsp + %1]
- mov r6, 32
- pxor m8, m8
-%%recon_and_store:
- mova m0, [stp + 16 * 32 * 0]
- mova m1, [stp + 16 * 32 * 1]
- mova m2, [stp + 16 * 32 * 2]
- mova m3, [stp + 16 * 32 * 3]
- add stp, 16
-
- paddw m0, m11
- paddw m1, m11
- paddw m2, m11
- paddw m3, m11
- psraw m0, 6
- psraw m1, 6
- psraw m2, 6
- psraw m3, 6
- movh m4, [outputq + 0]
- movh m5, [outputq + 8]
- movh m6, [outputq + 16]
- movh m7, [outputq + 24]
- punpcklbw m4, m8
- punpcklbw m5, m8
- punpcklbw m6, m8
- punpcklbw m7, m8
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
- packuswb m0, m1
- packuswb m2, m3
- mova [outputq + 0], m0
- mova [outputq + 16], m2
- lea outputq, [outputq + strideq]
- dec r6
- jnz %%recon_and_store
-%endmacro
-
-%define i32x32_size 16*32*5
-%define pass_two_start 16*32*0
-%define transposed_in 16*32*4
-%define pass_one_start 16*32*0
-%define stp r8
-
-INIT_XMM ssse3
-cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- lea stp, [rsp + pass_one_start]
-
-idct32x32_34:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
-
-idct32x32_34_transpose:
-%if CONFIG_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- IDCT32X32_34 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_34_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
-
-idct32x32_34_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- IDCT32X32_34 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_34_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_135 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- pmulhrsw m11, [pw_16364x2] ; stp2_31
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, m3
- pmulhrsw m3, [pw__7005x2] ; stp1_18
- pmulhrsw m4, [pw_14811x2] ; stp2_29
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, m0
- pmulhrsw m0, [pw_12140x2] ; stp1_30
- pmulhrsw m2, [pw_m11003x2] ; stp2_17
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, m2
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- pmulhrsw m6, [pw_15893x2] ; stp2_27
-
- mova m14, [rsp + transposed_in + 16 * 11]
- mova m13, m14
- pmulhrsw m13, [pw_m8423x2] ; stp1_21
- pmulhrsw m14, [pw_14053x2] ; stp2_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, m0
- pmulhrsw m0, [pw__9760x2] ; stp1_22
- pmulhrsw m1, [pw_13160x2] ; stp2_25
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- pmulhrsw m1, [pw_16305x2] ; stp2_15
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, m4
- pmulhrsw m4, [pw__7723x2] ; stp1_10
- pmulhrsw m5, [pw_14449x2] ; stp2_13
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, m2
- pmulhrsw m3, [pw_m10394x2] ; stp1_9
- pmulhrsw m2, [pw_12665x2] ; stp2_14
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, m13
- pmulhrsw m13, [pw_13623x2] ; stp1_6
- pmulhrsw m14, [pw_m9102x2] ; stp1_5
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m2, [rsp + transposed_in + 16 * 8]
- pmulhrsw m0, [pw_11585x2] ; stp1_1
- mova m3, m2
- pmulhrsw m2, [pw__6270x2] ; stp1_2
- pmulhrsw m3, [pw_15137x2] ; stp1_3
-
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m1, m0 ; stp1_0 = stp1_1
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 2
- lea stp, [rsp + pass_one_start]
-
-idct32x32_135:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 2
-
-idct32x32_135_transpose:
-%if CONFIG_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
-%if CONFIG_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose
-
- IDCT32X32_135 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
-%if CONFIG_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_135
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_135_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 2
-
-idct32x32_135_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose_2
-
- IDCT32X32_135 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_135_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_1024 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, [rsp + transposed_in + 16 * 31]
- BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, [rsp + transposed_in + 16 * 17]
- BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, [rsp + transposed_in + 16 * 25]
- BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, [rsp + transposed_in + 16 * 23]
- BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, [rsp + transposed_in + 16 * 27]
- BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
-
- mova m13, [rsp + transposed_in + 16 * 21]
- mova m14, [rsp + transposed_in + 16 * 11]
- BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, [rsp + transposed_in + 16 * 19]
- BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
-
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, [rsp + transposed_in + 16 * 29]
- BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, [rsp + transposed_in + 16 * 30]
- BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, [rsp + transposed_in + 16 * 18]
- BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, [rsp + transposed_in + 16 * 22]
- BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, [rsp + transposed_in + 16 * 26]
- BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, [rsp + transposed_in + 16 * 28]
- BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, [rsp + transposed_in + 16 * 20]
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m1, [rsp + transposed_in + 16 * 16]
-
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 0, 1, 9
- pmulhrsw m0, m10 ; stp1_1
- pmulhrsw m1, m10 ; stp1_0
-%else
- BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
- SWAP 0, 1
-%endif
- mova m2, [rsp + transposed_in + 16 * 8]
- mova m3, [rsp + transposed_in + 16 * 24]
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
-
- mova m10, [pw_11585x2]
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
-
-idct32x32_1024:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 4
-
-idct32x32_1024_transpose:
-%if CONFIG_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-%if CONFIG_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose
-
- IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
-
- lea stp, [stp + 16 * 8]
-%if CONFIG_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_1024
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_1024_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 4
-
-idct32x32_1024_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose_2
-
- IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_1024_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-%endif
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 6c03487..949d90f 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -12,6 +12,7 @@
#include <math.h>
#include <stdlib.h>
#include <string.h>
+#include <limits>
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -24,232 +25,488 @@
#include "av1/common/blockd.h"
#include "av1/common/scan.h"
#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
using libaom_test::ACMRandom;
namespace {
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int>
+typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out,
+ int stride, int bd);
+
+template <InvTxfmFunc fn>
+void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
+ (void)bd;
+ fn(in, out, stride);
+}
+
+#if CONFIG_HIGHBITDEPTH
+template <InvTxfmWithBdFunc fn>
+void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
+ fn(in, CONVERT_TO_BYTEPTR(out), stride, bd);
+}
+#endif
+
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc,
+ TX_SIZE, int, int, int>
PartialInvTxfmParam;
const int kMaxNumCoeffs = 1024;
+const int kCountTestBlock = 1000;
+
class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
public:
virtual ~PartialIDctTest() {}
virtual void SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
ftxfm_ = GET_PARAM(0);
full_itxfm_ = GET_PARAM(1);
partial_itxfm_ = GET_PARAM(2);
tx_size_ = GET_PARAM(3);
last_nonzero_ = GET_PARAM(4);
+ bit_depth_ = GET_PARAM(5);
+ pixel_size_ = GET_PARAM(6);
+ mask_ = (1 << bit_depth_) - 1;
+
+ switch (tx_size_) {
+ case TX_4X4: size_ = 4; break;
+ case TX_8X8: size_ = 8; break;
+ case TX_16X16: size_ = 16; break;
+ case TX_32X32: size_ = 32; break;
+ default: FAIL() << "Wrong Size!"; break;
+ }
+
+ // Randomize stride_ to a value less than or equal to 1024
+ stride_ = rnd_(1024) + 1;
+ if (stride_ < size_) {
+ stride_ = size_;
+ }
+ // Align stride_ to 16 if it's bigger than 16.
+ if (stride_ > 16) {
+ stride_ &= ~15;
+ }
+
+ input_block_size_ = size_ * size_;
+ output_block_size_ = size_ * stride_;
+
+ input_block_ = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(*input_block_) * input_block_size_));
+ output_block_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, pixel_size_ * output_block_size_));
+ output_block_ref_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, pixel_size_ * output_block_size_));
}
- virtual void TearDown() { libaom_test::ClearSystemState(); }
+ virtual void TearDown() {
+ aom_free(input_block_);
+ input_block_ = NULL;
+ aom_free(output_block_);
+ output_block_ = NULL;
+ aom_free(output_block_ref_);
+ output_block_ref_ = NULL;
+ libaom_test::ClearSystemState();
+ }
+
+ void InitMem() {
+ memset(input_block_, 0, sizeof(*input_block_) * input_block_size_);
+ if (pixel_size_ == 1) {
+ for (int j = 0; j < output_block_size_; ++j) {
+ output_block_[j] = output_block_ref_[j] = rnd_.Rand16() & mask_;
+ }
+ } else {
+ ASSERT_EQ(2, pixel_size_);
+ uint16_t *const output = reinterpret_cast<uint16_t *>(output_block_);
+ uint16_t *const output_ref =
+ reinterpret_cast<uint16_t *>(output_block_ref_);
+ for (int j = 0; j < output_block_size_; ++j) {
+ output[j] = output_ref[j] = rnd_.Rand16() & mask_;
+ }
+ }
+ }
+
+ void InitInput() {
+ const int max_coeff = 32766 / 4;
+ int max_energy_leftover = max_coeff * max_coeff;
+ for (int j = 0; j < last_nonzero_; ++j) {
+ int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+ (rnd_.Rand16() - 32768) / 65536);
+ max_energy_leftover -= coeff * coeff;
+ if (max_energy_leftover < 0) {
+ max_energy_leftover = 0;
+ coeff = 0;
+ }
+ input_block_[av1_default_scan_orders[tx_size_].scan[j]] = coeff;
+ }
+ }
protected:
int last_nonzero_;
TX_SIZE tx_size_;
+ tran_low_t *input_block_;
+ uint8_t *output_block_;
+ uint8_t *output_block_ref_;
+ int size_;
+ int stride_;
+ int pixel_size_;
+ int input_block_size_;
+ int output_block_size_;
+ int bit_depth_;
+ int mask_;
FwdTxfmFunc ftxfm_;
- InvTxfmFunc full_itxfm_;
- InvTxfmFunc partial_itxfm_;
+ InvTxfmWithBdFunc full_itxfm_;
+ InvTxfmWithBdFunc partial_itxfm_;
+ ACMRandom rnd_;
};
TEST_P(PartialIDctTest, RunQuantCheck) {
- int size;
- switch (tx_size_) {
- case TX_4X4: size = 4; break;
- case TX_8X8: size = 8; break;
- case TX_16X16: size = 16; break;
- case TX_32X32: size = 32; break;
- default: FAIL() << "Wrong Size!"; break;
- }
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
-
- const int count_test_block = 1000;
- const int block_size = size * size;
-
DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
- int max_error = 0;
- for (int m = 0; m < count_test_block; ++m) {
- // clear out destination buffer
- memset(dst1, 0, sizeof(*dst1) * block_size);
- memset(dst2, 0, sizeof(*dst2) * block_size);
- memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
- memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
-
- ACMRandom rnd(ACMRandom::DeterministicSeed());
-
- for (int n = 0; n < count_test_block; ++n) {
- // Initialize a test block with input range [-255, 255].
- if (n == 0) {
- for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255;
- } else if (n == 1) {
- for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255;
- } else {
- for (int j = 0; j < block_size; ++j) {
- input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
- }
+ InitMem();
+ for (int i = 0; i < kCountTestBlock; ++i) {
+    // Initialize a test block with extreme input values (+/-mask_).
+ if (i == 0) {
+ for (int k = 0; k < input_block_size_; ++k) {
+ input_extreme_block[k] = mask_;
}
-
- ftxfm_(input_extreme_block, output_ref_block, size);
-
- // quantization with maximum allowed step sizes
- test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
- for (int j = 1; j < last_nonzero_; ++j)
- test_coef_block1[av1_default_scan_orders[tx_size_].scan[j]] =
- (output_ref_block[j] / 1828) * 1828;
+ } else if (i == 1) {
+ for (int k = 0; k < input_block_size_; ++k) {
+ input_extreme_block[k] = -mask_;
+ }
+ } else {
+ for (int k = 0; k < input_block_size_; ++k) {
+ input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+ }
}
- ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
- ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+ ftxfm_(input_extreme_block, output_ref_block, size_);
- for (int j = 0; j < block_size; ++j) {
- const int diff = dst1[j] - dst2[j];
- const int error = diff * diff;
- if (max_error < error) max_error = error;
+    // Quantize with the minimum allowed step size (4).
+ input_block_[0] = (output_ref_block[0] / 4) * 4;
+ for (int k = 1; k < last_nonzero_; ++k) {
+ const int pos = av1_default_scan_orders[tx_size_].scan[k];
+ input_block_[pos] = (output_ref_block[pos] / 4) * 4;
}
+
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ pixel_size_ * output_block_size_))
+ << "Error: partial inverse transform produces different results";
}
-
- EXPECT_EQ(0, max_error)
- << "Error: partial inverse transform produces different results";
}
TEST_P(PartialIDctTest, ResultsMatch) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- int size;
- switch (tx_size_) {
- case TX_4X4: size = 4; break;
- case TX_8X8: size = 8; break;
- case TX_16X16: size = 16; break;
- case TX_32X32: size = 32; break;
- default: FAIL() << "Wrong Size!"; break;
+ for (int i = 0; i < kCountTestBlock; ++i) {
+ InitMem();
+ InitInput();
+
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ pixel_size_ * output_block_size_))
+ << "Error: partial inverse transform produces different results";
}
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
- const int count_test_block = 1000;
- const int max_coeff = 32766 / 4;
- const int block_size = size * size;
- int max_error = 0;
- for (int i = 0; i < count_test_block; ++i) {
- // clear out destination buffer
- memset(dst1, 0, sizeof(*dst1) * block_size);
- memset(dst2, 0, sizeof(*dst2) * block_size);
- memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
- memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
- int max_energy_leftover = max_coeff * max_coeff;
+}
+
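+// Verify that the full and partial transforms add their results to a
+// randomized destination block in the same way.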
+TEST_P(PartialIDctTest, AddOutputBlock) {
+ for (int i = 0; i < kCountTestBlock; ++i) {
+ InitMem();
for (int j = 0; j < last_nonzero_; ++j) {
- int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
- (rnd.Rand16() - 32768) / 65536);
- max_energy_leftover -= coef * coef;
- if (max_energy_leftover < 0) {
- max_energy_leftover = 0;
- coef = 0;
- }
- test_coef_block1[av1_default_scan_orders[tx_size_].scan[j]] = coef;
+ input_block_[av1_default_scan_orders[tx_size_].scan[j]] = 10;
}
- memcpy(test_coef_block2, test_coef_block1,
- sizeof(*test_coef_block2) * block_size);
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ pixel_size_ * output_block_size_))
+ << "Error: Transform results are not correctly added to output.";
+ }
+}
- ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
- ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+TEST_P(PartialIDctTest, SingleExtremeCoeff) {
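+  // Place INT16_MAX and INT16_MIN at each coefficient position in turn and
+  // require the full and partial transforms to agree.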
+ const int16_t max_coeff = std::numeric_limits<int16_t>::max();
+ const int16_t min_coeff = std::numeric_limits<int16_t>::min();
+ for (int i = 0; i < last_nonzero_; ++i) {
+ memset(input_block_, 0, sizeof(*input_block_) * input_block_size_);
+    // Run once with the maximum coefficient and once with the minimum.
+ for (int j = 0; j < 2; ++j) {
+ const int coeff = j ? min_coeff : max_coeff;
- for (int j = 0; j < block_size; ++j) {
- const int diff = dst1[j] - dst2[j];
- const int error = diff * diff;
- if (max_error < error) max_error = error;
+ memset(output_block_, 0, pixel_size_ * output_block_size_);
+ memset(output_block_ref_, 0, pixel_size_ * output_block_size_);
+ input_block_[av1_default_scan_orders[tx_size_].scan[i]] = coeff;
+
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ ASM_REGISTER_STATE_CHECK(
+ partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ pixel_size_ * output_block_size_))
+ << "Error: Fails with single coeff of " << coeff << " at " << i
+ << ".";
}
}
+}
- EXPECT_EQ(0, max_error)
+TEST_P(PartialIDctTest, DISABLED_Speed) {
+  // Scale the iteration count to keep runtime roughly constant across sizes.
+ const int kCountSpeedTestBlock = 500000000 / input_block_size_;
+ InitMem();
+ InitInput();
+
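+  // Produce the reference output with the full transform (not timed).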
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ ASM_REGISTER_STATE_CHECK(
+ full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+ }
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+ partial_itxfm_(input_block_, output_block_, stride_, bit_depth_);
+ }
+ libaom_test::ClearSystemState();
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+ printf("idct%dx%d_%d (bitdepth %d) time: %5d ms\n", size_, size_,
+ last_nonzero_, bit_depth_, elapsed_time);
+
+ ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+ pixel_size_ * output_block_size_))
<< "Error: partial inverse transform produces different results";
}
+
using std::tr1::make_tuple;
-INSTANTIATE_TEST_CASE_P(
- C, PartialIDctTest,
- ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_34_add_c, TX_32X32, 34),
- make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_1_add_c, TX_32X32, 1),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_10_add_c, TX_16X16, 10),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_1_add_c, TX_16X16, 1),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_12_add_c, TX_8X8, 12),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_1_add_c, TX_8X8, 1),
- make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
- &aom_idct4x4_1_add_c, TX_4X4, 1)));
+const PartialInvTxfmParam c_partial_idct_tests[] = {
+#if CONFIG_HIGHBITDEPTH
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_10_add_c>, TX_16X16, 10, 8, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_10_add_c>, TX_16X16, 10, 10, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_10_add_c>, TX_16X16, 10, 12, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>, TX_4X4, 16, 8, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>, TX_4X4, 16, 10, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>, TX_4X4, 16, 12, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_1_add_c>, TX_4X4, 1, 8, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_1_add_c>, TX_4X4, 1, 10, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_1_add_c>, TX_4X4, 1, 12, 2),
+#endif // CONFIG_HIGHBITDEPTH
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_c>, TX_32X32, 1024, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_135_add_c>, TX_32X32, 135, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_34_add_c>, TX_32X32, 34, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1_add_c>, TX_32X32, 1, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_256_add_c>, TX_16X16, 256, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_38_add_c>, TX_16X16, 38, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_10_add_c>, TX_16X16, 10, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_1_add_c>, TX_16X16, 1, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_64_add_c>, TX_8X8, 64, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_12_add_c>, TX_8X8, 12, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_1_add_c>, TX_8X8, 1, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_16_add_c>, TX_4X4, 16, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_1_add_c>, TX_4X4, 1, 8, 1)
+};
+
+INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,
+ ::testing::ValuesIn(c_partial_idct_tests));
#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- NEON, PartialIDctTest,
- ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_1_add_neon, TX_32X32, 1),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_10_add_neon, TX_16X16, 10),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_1_add_neon, TX_16X16, 1),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_12_add_neon, TX_8X8, 12),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_1_add_neon, TX_8X8, 1),
- make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
- &aom_idct4x4_1_add_neon, TX_4X4, 1)));
+const PartialInvTxfmParam neon_partial_idct_tests[] = {
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1_add_neon>, TX_32X32, 1, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_10_add_neon>, TX_16X16, 10, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_1_add_neon>, TX_16X16, 1, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_12_add_neon>, TX_8X8, 12, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_1_add_neon>, TX_8X8, 1, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_1_add_neon>, TX_4X4, 1, 8, 1)
+};
+
+INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
+ ::testing::ValuesIn(neon_partial_idct_tests));
#endif // HAVE_NEON && !CONFIG_HIGHBITDEPTH
-#if HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- SSE2, PartialIDctTest,
- ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_34_add_sse2, TX_32X32, 34),
- make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_1_add_sse2, TX_32X32, 1),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_10_add_sse2, TX_16X16, 10),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_1_add_sse2, TX_16X16, 1),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_12_add_sse2, TX_8X8, 12),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_1_add_sse2, TX_8X8, 1),
- make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
- &aom_idct4x4_1_add_sse2, TX_4X4, 1)));
-#endif
+#if HAVE_SSE2
+const PartialInvTxfmParam sse2_partial_idct_tests[] = {
+#if CONFIG_HIGHBITDEPTH
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 10, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 10, 2),
+ make_tuple(
+ &aom_highbd_fdct16x16_c, &highbd_wrapper<aom_highbd_idct16x16_256_add_c>,
+ &highbd_wrapper<aom_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),
+ make_tuple(&aom_highbd_fdct4x4_c,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 2),
+ make_tuple(
+ &aom_highbd_fdct4x4_c, &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 10, 2),
+ make_tuple(
+ &aom_highbd_fdct4x4_c, &highbd_wrapper<aom_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<aom_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 12, 2),
+#endif // CONFIG_HIGHBITDEPTH
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_sse2>, TX_32X32, 135, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_12_add_sse2>, TX_8X8, 12, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 1)
+};
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- SSSE3_64, PartialIDctTest,
- ::testing::Values(make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_12_add_ssse3, TX_8X8, 12)));
-#endif
+INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
+ ::testing::ValuesIn(sse2_partial_idct_tests));
+
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
+ ::testing::ValuesIn(ssse3_partial_idct_tests));
+#endif // HAVE_SSSE3
+
+#if HAVE_DSPR2 && !CONFIG_HIGHBITDEPTH
+const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_dspr2>, TX_32X32, 135, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_34_add_dspr2>, TX_32X32, 34, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1_add_dspr2>, TX_32X32, 1, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_256_add_dspr2>, TX_16X16, 256, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_10_add_dspr2>, TX_16X16, 10, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_1_add_dspr2>, TX_16X16, 1, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_64_add_dspr2>, TX_8X8, 64, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_12_add_dspr2>, TX_8X8, 12, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_1_add_dspr2>, TX_8X8, 1, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_16_add_dspr2>, TX_4X4, 16, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_1_add_dspr2>, TX_4X4, 1, 8, 1)
+};
+
+INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
+ ::testing::ValuesIn(dspr2_partial_idct_tests));
+#endif // HAVE_DSPR2 && !CONFIG_HIGHBITDEPTH
#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- MSA, PartialIDctTest,
- ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_34_add_msa, TX_32X32, 34),
- make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
- &aom_idct32x32_1_add_msa, TX_32X32, 1),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_10_add_msa, TX_16X16, 10),
- make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
- &aom_idct16x16_1_add_msa, TX_16X16, 1),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_12_add_msa, TX_8X8, 10),
- make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
- &aom_idct8x8_1_add_msa, TX_8X8, 1),
- make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
- &aom_idct4x4_1_add_msa, TX_4X4, 1)));
+const PartialInvTxfmParam msa_partial_idct_tests[] = {
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_msa>, TX_32X32, 1024, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1024_add_msa>, TX_32X32, 135, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_34_add_msa>, TX_32X32, 34, 8, 1),
+ make_tuple(&aom_fdct32x32_c, &wrapper<aom_idct32x32_1024_add_c>,
+ &wrapper<aom_idct32x32_1_add_msa>, TX_32X32, 1, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_256_add_msa>, TX_16X16, 256, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_10_add_msa>, TX_16X16, 10, 8, 1),
+ make_tuple(&aom_fdct16x16_c, &wrapper<aom_idct16x16_256_add_c>,
+ &wrapper<aom_idct16x16_1_add_msa>, TX_16X16, 1, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_64_add_msa>, TX_8X8, 64, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_12_add_msa>, TX_8X8, 12, 8, 1),
+ make_tuple(&aom_fdct8x8_c, &wrapper<aom_idct8x8_64_add_c>,
+ &wrapper<aom_idct8x8_1_add_msa>, TX_8X8, 1, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_16_add_msa>, TX_4X4, 16, 8, 1),
+ make_tuple(&aom_fdct4x4_c, &wrapper<aom_idct4x4_16_add_c>,
+ &wrapper<aom_idct4x4_1_add_msa>, TX_4X4, 1, 8, 1)
+};
+
+INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest,
+ ::testing::ValuesIn(msa_partial_idct_tests));
#endif // HAVE_MSA && !CONFIG_HIGHBITDEPTH
} // namespace