Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | /* |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | * |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 12 | #ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_ |
| 13 | #define AOM_DSP_X86_INV_TXFM_SSE2_H_ |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 14 | |
| 15 | #include <emmintrin.h> // SSE2 |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 16 | #include "./aom_config.h" |
| 17 | #include "aom/aom_integer.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 18 | #include "aom_dsp/inv_txfm.h" |
| 19 | #include "aom_dsp/x86/txfm_common_sse2.h" |
| 20 | |
| 21 | // perform 8x8 transpose |
David Barker | 4d03d6f | 2016-10-03 16:27:27 +0100 | [diff] [blame] | 22 | static INLINE void array_transpose_4x4(__m128i *res) { |
| 23 | const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); |
| 24 | const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); |
| 25 | |
| 26 | res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); |
| 27 | res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); |
| 28 | } |
| 29 | |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 30 | static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { |
| 31 | const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |
| 32 | const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); |
| 33 | const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); |
| 34 | const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); |
| 35 | const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); |
| 36 | const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); |
| 37 | const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); |
| 38 | const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); |
| 39 | |
| 40 | const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 41 | const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
| 42 | const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 43 | const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
| 44 | const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
| 45 | const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); |
| 46 | const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
| 47 | const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); |
| 48 | |
| 49 | res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); |
| 50 | res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); |
| 51 | res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); |
| 52 | res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); |
| 53 | res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); |
| 54 | res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); |
| 55 | res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); |
| 56 | res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); |
| 57 | } |
| 58 | |
// TRANSPOSE_8X4: interleaves the low four 16-bit lanes of in0..in3, first at
// 16-bit and then at 32-bit granularity, and assigns the two results back to
// in0 and in1 (both must therefore be assignable lvalues).
// NOTE(review): out0 and out1 are accepted but never referenced by the
// expansion; the results always land in in0/in1 whatever is passed for them.
| 59 | #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ |
| 60 | { \ |
| 61 | const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ |
| 62 | const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ |
| 63 | \ |
| 64 | in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ |
| 65 | in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ |
| 66 | } |
| 67 | |
| 68 | static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { |
| 69 | const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |
| 70 | const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); |
| 71 | const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); |
| 72 | const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); |
| 73 | |
| 74 | const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 75 | const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 76 | const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
| 77 | const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
| 78 | |
| 79 | out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); |
| 80 | out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); |
| 81 | out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); |
| 82 | out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); |
| 83 | } |
| 84 | |
| 85 | static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { |
| 86 | __m128i tbuf[8]; |
| 87 | array_transpose_8x8(res0, res0); |
| 88 | array_transpose_8x8(res1, tbuf); |
| 89 | array_transpose_8x8(res0 + 8, res1); |
| 90 | array_transpose_8x8(res1 + 8, res1 + 8); |
| 91 | |
| 92 | res0[8] = tbuf[0]; |
| 93 | res0[9] = tbuf[1]; |
| 94 | res0[10] = tbuf[2]; |
| 95 | res0[11] = tbuf[3]; |
| 96 | res0[12] = tbuf[4]; |
| 97 | res0[13] = tbuf[5]; |
| 98 | res0[14] = tbuf[6]; |
| 99 | res0[15] = tbuf[7]; |
| 100 | } |
| 101 | |
| 102 | // Function to allow 8 bit optimisations to be used when profile 0 is used with |
| 103 | // highbitdepth enabled |
| 104 | static INLINE __m128i load_input_data(const tran_low_t *data) { |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 105 | #if CONFIG_AOM_HIGHBITDEPTH |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 106 | return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], |
| 107 | data[6], data[7]); |
| 108 | #else |
| 109 | return _mm_load_si128((const __m128i *)data); |
| 110 | #endif |
| 111 | } |
| 112 | |
| 113 | static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { |
| 114 | in[0] = load_input_data(input + 0 * 16); |
| 115 | in[1] = load_input_data(input + 1 * 16); |
| 116 | in[2] = load_input_data(input + 2 * 16); |
| 117 | in[3] = load_input_data(input + 3 * 16); |
| 118 | in[4] = load_input_data(input + 4 * 16); |
| 119 | in[5] = load_input_data(input + 5 * 16); |
| 120 | in[6] = load_input_data(input + 6 * 16); |
| 121 | in[7] = load_input_data(input + 7 * 16); |
| 122 | |
| 123 | in[8] = load_input_data(input + 8 * 16); |
| 124 | in[9] = load_input_data(input + 9 * 16); |
| 125 | in[10] = load_input_data(input + 10 * 16); |
| 126 | in[11] = load_input_data(input + 11 * 16); |
| 127 | in[12] = load_input_data(input + 12 * 16); |
| 128 | in[13] = load_input_data(input + 13 * 16); |
| 129 | in[14] = load_input_data(input + 14 * 16); |
| 130 | in[15] = load_input_data(input + 15 * 16); |
| 131 | } |
| 132 | |
// RECON_AND_STORE: loads eight bytes from dest, widens them to 16-bit lanes
// by interleaving with `zero`, adds in_x lane by lane, packs the sums back to
// bytes with unsigned saturation, and stores the eight bytes to dest.
// Requires a __m128i named `zero` to be in scope at the expansion site; the
// widening is a zero-extension only if it holds all zeros.
// dest appears twice in the expansion, so it should be free of side effects.
| 133 | #define RECON_AND_STORE(dest, in_x) \ |
| 134 | { \ |
| 135 | __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ |
| 136 | d0 = _mm_unpacklo_epi8(d0, zero); \ |
| 137 | d0 = _mm_add_epi16(in_x, d0); \ |
| 138 | d0 = _mm_packus_epi16(d0, d0); \ |
| 139 | _mm_storel_epi64((__m128i *)(dest), d0); \ |
| 140 | } |
| 141 | |
| 142 | static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { |
| 143 | const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
| 144 | const __m128i zero = _mm_setzero_si128(); |
| 145 | // Final rounding and shift |
| 146 | in[0] = _mm_adds_epi16(in[0], final_rounding); |
| 147 | in[1] = _mm_adds_epi16(in[1], final_rounding); |
| 148 | in[2] = _mm_adds_epi16(in[2], final_rounding); |
| 149 | in[3] = _mm_adds_epi16(in[3], final_rounding); |
| 150 | in[4] = _mm_adds_epi16(in[4], final_rounding); |
| 151 | in[5] = _mm_adds_epi16(in[5], final_rounding); |
| 152 | in[6] = _mm_adds_epi16(in[6], final_rounding); |
| 153 | in[7] = _mm_adds_epi16(in[7], final_rounding); |
| 154 | in[8] = _mm_adds_epi16(in[8], final_rounding); |
| 155 | in[9] = _mm_adds_epi16(in[9], final_rounding); |
| 156 | in[10] = _mm_adds_epi16(in[10], final_rounding); |
| 157 | in[11] = _mm_adds_epi16(in[11], final_rounding); |
| 158 | in[12] = _mm_adds_epi16(in[12], final_rounding); |
| 159 | in[13] = _mm_adds_epi16(in[13], final_rounding); |
| 160 | in[14] = _mm_adds_epi16(in[14], final_rounding); |
| 161 | in[15] = _mm_adds_epi16(in[15], final_rounding); |
| 162 | |
| 163 | in[0] = _mm_srai_epi16(in[0], 6); |
| 164 | in[1] = _mm_srai_epi16(in[1], 6); |
| 165 | in[2] = _mm_srai_epi16(in[2], 6); |
| 166 | in[3] = _mm_srai_epi16(in[3], 6); |
| 167 | in[4] = _mm_srai_epi16(in[4], 6); |
| 168 | in[5] = _mm_srai_epi16(in[5], 6); |
| 169 | in[6] = _mm_srai_epi16(in[6], 6); |
| 170 | in[7] = _mm_srai_epi16(in[7], 6); |
| 171 | in[8] = _mm_srai_epi16(in[8], 6); |
| 172 | in[9] = _mm_srai_epi16(in[9], 6); |
| 173 | in[10] = _mm_srai_epi16(in[10], 6); |
| 174 | in[11] = _mm_srai_epi16(in[11], 6); |
| 175 | in[12] = _mm_srai_epi16(in[12], 6); |
| 176 | in[13] = _mm_srai_epi16(in[13], 6); |
| 177 | in[14] = _mm_srai_epi16(in[14], 6); |
| 178 | in[15] = _mm_srai_epi16(in[15], 6); |
| 179 | |
| 180 | RECON_AND_STORE(dest + 0 * stride, in[0]); |
| 181 | RECON_AND_STORE(dest + 1 * stride, in[1]); |
| 182 | RECON_AND_STORE(dest + 2 * stride, in[2]); |
| 183 | RECON_AND_STORE(dest + 3 * stride, in[3]); |
| 184 | RECON_AND_STORE(dest + 4 * stride, in[4]); |
| 185 | RECON_AND_STORE(dest + 5 * stride, in[5]); |
| 186 | RECON_AND_STORE(dest + 6 * stride, in[6]); |
| 187 | RECON_AND_STORE(dest + 7 * stride, in[7]); |
| 188 | RECON_AND_STORE(dest + 8 * stride, in[8]); |
| 189 | RECON_AND_STORE(dest + 9 * stride, in[9]); |
| 190 | RECON_AND_STORE(dest + 10 * stride, in[10]); |
| 191 | RECON_AND_STORE(dest + 11 * stride, in[11]); |
| 192 | RECON_AND_STORE(dest + 12 * stride, in[12]); |
| 193 | RECON_AND_STORE(dest + 13 * stride, in[13]); |
| 194 | RECON_AND_STORE(dest + 14 * stride, in[14]); |
| 195 | RECON_AND_STORE(dest + 15 * stride, in[15]); |
| 196 | } |
| 197 | |
// Prototypes only; the definitions are not part of this header.
// NOTE(review): the names suggest inverse DCT/ADST kernels of size 4, 8, 16
// and 32 that work on arrays of __m128i - confirm the expected array lengths
// and in-place behaviour against the implementing source file.
Peter de Rivaz | 1baecfe | 2016-09-29 09:14:54 +0100 | [diff] [blame] | 198 | void iadst16_8col(__m128i *in); |
| 199 | void idct16_8col(__m128i *in); |
Luca Barbato | f0f9857 | 2016-09-03 12:14:15 +0200 | [diff] [blame] | 200 | void aom_idct4_sse2(__m128i *in); |
| 201 | void aom_idct8_sse2(__m128i *in); |
| 202 | void aom_idct16_sse2(__m128i *in0, __m128i *in1); |
| 203 | void aom_iadst4_sse2(__m128i *in); |
| 204 | void aom_iadst8_sse2(__m128i *in); |
| 205 | void aom_iadst16_sse2(__m128i *in0, __m128i *in1); |
David Barker | 33231d4 | 2016-10-06 17:25:40 +0100 | [diff] [blame] | 206 | void idct32_8col(__m128i *in0, __m128i *in1); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 207 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 208 | #endif // AOM_DSP_X86_INV_TXFM_SSE2_H_ |