Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 1 | /* |
Adrian Grange | a872b06 | 2016-03-24 11:38:32 -0700 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 3 | * |
Adrian Grange | a872b06 | 2016-03-24 11:38:32 -0700 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 12 | #include <assert.h> |
Yaowu Xu | bf4202e | 2016-03-21 15:15:19 -0700 | [diff] [blame] | 13 | #include "aom_dsp/fwd_txfm.h" |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 14 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 15 | void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 16 | // The 2D transform is done with two passes which are actually pretty |
| 17 | // similar. In the first one, we transform the columns and transpose |
| 18 | // the results. In the second one, we transform the rows. To achieve that, |
| 19 | // as the first pass results are transposed, we transpose the columns (that |
| 20 | // is the transposed rows) and transpose the results (so that it goes back |
| 21 | // in normal/row positions). |
| 22 | int pass; |
| 23 | // We need an intermediate buffer between passes. |
| 24 | tran_low_t intermediate[4 * 4]; |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 25 | const tran_low_t *in_low = NULL; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 26 | tran_low_t *out = intermediate; |
| 27 | // Do the two transform/transpose passes |
| 28 | for (pass = 0; pass < 2; ++pass) { |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 29 | tran_high_t in_high[4]; // canbe16 |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 30 | tran_high_t step[4]; // canbe16 |
| 31 | tran_high_t temp1, temp2; // needs32 |
| 32 | int i; |
| 33 | for (i = 0; i < 4; ++i) { |
| 34 | // Load inputs. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 35 | if (pass == 0) { |
| 36 | in_high[0] = input[0 * stride] * 16; |
| 37 | in_high[1] = input[1 * stride] * 16; |
| 38 | in_high[2] = input[2 * stride] * 16; |
| 39 | in_high[3] = input[3 * stride] * 16; |
| 40 | if (i == 0 && in_high[0]) { |
| 41 | ++in_high[0]; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 42 | } |
| 43 | } else { |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 44 | assert(in_low != NULL); |
| 45 | in_high[0] = in_low[0 * 4]; |
| 46 | in_high[1] = in_low[1 * 4]; |
| 47 | in_high[2] = in_low[2 * 4]; |
| 48 | in_high[3] = in_low[3 * 4]; |
| 49 | ++in_low; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 50 | } |
| 51 | // Transform. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 52 | step[0] = in_high[0] + in_high[3]; |
| 53 | step[1] = in_high[1] + in_high[2]; |
| 54 | step[2] = in_high[1] - in_high[2]; |
| 55 | step[3] = in_high[0] - in_high[3]; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 56 | temp1 = (step[0] + step[1]) * cospi_16_64; |
| 57 | temp2 = (step[0] - step[1]) * cospi_16_64; |
| 58 | out[0] = (tran_low_t)fdct_round_shift(temp1); |
| 59 | out[2] = (tran_low_t)fdct_round_shift(temp2); |
| 60 | temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
| 61 | temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
| 62 | out[1] = (tran_low_t)fdct_round_shift(temp1); |
| 63 | out[3] = (tran_low_t)fdct_round_shift(temp2); |
| 64 | // Do next column (which is a transposed row in second/horizontal pass) |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 65 | ++input; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 66 | out += 4; |
| 67 | } |
| 68 | // Setup in/out for next pass. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 69 | in_low = intermediate; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 70 | out = output; |
| 71 | } |
| 72 | |
| 73 | { |
| 74 | int i, j; |
| 75 | for (i = 0; i < 4; ++i) { |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 76 | for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 77 | } |
| 78 | } |
| 79 | } |
| 80 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 81 | void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 82 | int r, c; |
| 83 | tran_low_t sum = 0; |
| 84 | for (r = 0; r < 4; ++r) |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 85 | for (c = 0; c < 4; ++c) sum += input[r * stride + c]; |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 86 | |
| 87 | output[0] = sum << 1; |
| 88 | output[1] = 0; |
| 89 | } |
| 90 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 91 | void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 92 | int i, j; |
| 93 | tran_low_t intermediate[64]; |
| 94 | int pass; |
| 95 | tran_low_t *output = intermediate; |
| 96 | const tran_low_t *in = NULL; |
| 97 | |
| 98 | // Transform columns |
| 99 | for (pass = 0; pass < 2; ++pass) { |
| 100 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
| 101 | tran_high_t t0, t1, t2, t3; // needs32 |
| 102 | tran_high_t x0, x1, x2, x3; // canbe16 |
| 103 | |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 104 | for (i = 0; i < 8; i++) { |
| 105 | // stage 1 |
| 106 | if (pass == 0) { |
| 107 | s0 = (input[0 * stride] + input[7 * stride]) * 4; |
| 108 | s1 = (input[1 * stride] + input[6 * stride]) * 4; |
| 109 | s2 = (input[2 * stride] + input[5 * stride]) * 4; |
| 110 | s3 = (input[3 * stride] + input[4 * stride]) * 4; |
| 111 | s4 = (input[3 * stride] - input[4 * stride]) * 4; |
| 112 | s5 = (input[2 * stride] - input[5 * stride]) * 4; |
| 113 | s6 = (input[1 * stride] - input[6 * stride]) * 4; |
| 114 | s7 = (input[0 * stride] - input[7 * stride]) * 4; |
| 115 | ++input; |
| 116 | } else { |
| 117 | s0 = in[0 * 8] + in[7 * 8]; |
| 118 | s1 = in[1 * 8] + in[6 * 8]; |
| 119 | s2 = in[2 * 8] + in[5 * 8]; |
| 120 | s3 = in[3 * 8] + in[4 * 8]; |
| 121 | s4 = in[3 * 8] - in[4 * 8]; |
| 122 | s5 = in[2 * 8] - in[5 * 8]; |
| 123 | s6 = in[1 * 8] - in[6 * 8]; |
| 124 | s7 = in[0 * 8] - in[7 * 8]; |
| 125 | ++in; |
| 126 | } |
| 127 | |
| 128 | // fdct4(step, step); |
| 129 | x0 = s0 + s3; |
| 130 | x1 = s1 + s2; |
| 131 | x2 = s1 - s2; |
| 132 | x3 = s0 - s3; |
| 133 | t0 = (x0 + x1) * cospi_16_64; |
| 134 | t1 = (x0 - x1) * cospi_16_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 135 | t2 = x2 * cospi_24_64 + x3 * cospi_8_64; |
| 136 | t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 137 | output[0] = (tran_low_t)fdct_round_shift(t0); |
| 138 | output[2] = (tran_low_t)fdct_round_shift(t2); |
| 139 | output[4] = (tran_low_t)fdct_round_shift(t1); |
| 140 | output[6] = (tran_low_t)fdct_round_shift(t3); |
| 141 | |
| 142 | // Stage 2 |
| 143 | t0 = (s6 - s5) * cospi_16_64; |
| 144 | t1 = (s6 + s5) * cospi_16_64; |
| 145 | t2 = fdct_round_shift(t0); |
| 146 | t3 = fdct_round_shift(t1); |
| 147 | |
| 148 | // Stage 3 |
| 149 | x0 = s4 + t2; |
| 150 | x1 = s4 - t2; |
| 151 | x2 = s7 - t3; |
| 152 | x3 = s7 + t3; |
| 153 | |
| 154 | // Stage 4 |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 155 | t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
| 156 | t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 157 | t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 158 | t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 159 | output[1] = (tran_low_t)fdct_round_shift(t0); |
| 160 | output[3] = (tran_low_t)fdct_round_shift(t2); |
| 161 | output[5] = (tran_low_t)fdct_round_shift(t1); |
| 162 | output[7] = (tran_low_t)fdct_round_shift(t3); |
| 163 | output += 8; |
| 164 | } |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 165 | in = intermediate; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 166 | output = final_output; |
| 167 | } |
| 168 | |
| 169 | // Rows |
| 170 | for (i = 0; i < 8; ++i) { |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 171 | for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 172 | } |
| 173 | } |
| 174 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 175 | void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 176 | int r, c; |
| 177 | tran_low_t sum = 0; |
| 178 | for (r = 0; r < 8; ++r) |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 179 | for (c = 0; c < 8; ++c) sum += input[r * stride + c]; |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 180 | |
| 181 | output[0] = sum; |
| 182 | output[1] = 0; |
| 183 | } |
| 184 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 185 | void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 186 | // The 2D transform is done with two passes which are actually pretty |
| 187 | // similar. In the first one, we transform the columns and transpose |
| 188 | // the results. In the second one, we transform the rows. To achieve that, |
| 189 | // as the first pass results are transposed, we transpose the columns (that |
| 190 | // is the transposed rows) and transpose the results (so that it goes back |
| 191 | // in normal/row positions). |
| 192 | int pass; |
| 193 | // We need an intermediate buffer between passes. |
| 194 | tran_low_t intermediate[256]; |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 195 | const tran_low_t *in_low = NULL; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 196 | tran_low_t *out = intermediate; |
| 197 | // Do the two transform/transpose passes |
| 198 | for (pass = 0; pass < 2; ++pass) { |
| 199 | tran_high_t step1[8]; // canbe16 |
| 200 | tran_high_t step2[8]; // canbe16 |
| 201 | tran_high_t step3[8]; // canbe16 |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 202 | tran_high_t in_high[8]; // canbe16 |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 203 | tran_high_t temp1, temp2; // needs32 |
| 204 | int i; |
| 205 | for (i = 0; i < 16; i++) { |
| 206 | if (0 == pass) { |
| 207 | // Calculate input for the first 8 results. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 208 | in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; |
| 209 | in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; |
| 210 | in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; |
| 211 | in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; |
| 212 | in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; |
| 213 | in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; |
| 214 | in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; |
| 215 | in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 216 | // Calculate input for the next 8 results. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 217 | step1[0] = (input[7 * stride] - input[8 * stride]) * 4; |
| 218 | step1[1] = (input[6 * stride] - input[9 * stride]) * 4; |
| 219 | step1[2] = (input[5 * stride] - input[10 * stride]) * 4; |
| 220 | step1[3] = (input[4 * stride] - input[11 * stride]) * 4; |
| 221 | step1[4] = (input[3 * stride] - input[12 * stride]) * 4; |
| 222 | step1[5] = (input[2 * stride] - input[13 * stride]) * 4; |
| 223 | step1[6] = (input[1 * stride] - input[14 * stride]) * 4; |
| 224 | step1[7] = (input[0 * stride] - input[15 * stride]) * 4; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 225 | } else { |
| 226 | // Calculate input for the first 8 results. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 227 | assert(in_low != NULL); |
| 228 | in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); |
| 229 | in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); |
| 230 | in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); |
| 231 | in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); |
| 232 | in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); |
| 233 | in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); |
| 234 | in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); |
| 235 | in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 236 | // Calculate input for the next 8 results. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 237 | step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); |
| 238 | step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); |
| 239 | step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); |
| 240 | step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); |
| 241 | step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); |
| 242 | step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); |
| 243 | step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); |
| 244 | step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); |
| 245 | in_low++; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 246 | } |
| 247 | // Work on the first eight values; fdct8(input, even_results); |
| 248 | { |
| 249 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
| 250 | tran_high_t t0, t1, t2, t3; // needs32 |
| 251 | tran_high_t x0, x1, x2, x3; // canbe16 |
| 252 | |
| 253 | // stage 1 |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 254 | s0 = in_high[0] + in_high[7]; |
| 255 | s1 = in_high[1] + in_high[6]; |
| 256 | s2 = in_high[2] + in_high[5]; |
| 257 | s3 = in_high[3] + in_high[4]; |
| 258 | s4 = in_high[3] - in_high[4]; |
| 259 | s5 = in_high[2] - in_high[5]; |
| 260 | s6 = in_high[1] - in_high[6]; |
| 261 | s7 = in_high[0] - in_high[7]; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 262 | |
| 263 | // fdct4(step, step); |
| 264 | x0 = s0 + s3; |
| 265 | x1 = s1 + s2; |
| 266 | x2 = s1 - s2; |
| 267 | x3 = s0 - s3; |
| 268 | t0 = (x0 + x1) * cospi_16_64; |
| 269 | t1 = (x0 - x1) * cospi_16_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 270 | t2 = x3 * cospi_8_64 + x2 * cospi_24_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 271 | t3 = x3 * cospi_24_64 - x2 * cospi_8_64; |
| 272 | out[0] = (tran_low_t)fdct_round_shift(t0); |
| 273 | out[4] = (tran_low_t)fdct_round_shift(t2); |
| 274 | out[8] = (tran_low_t)fdct_round_shift(t1); |
| 275 | out[12] = (tran_low_t)fdct_round_shift(t3); |
| 276 | |
| 277 | // Stage 2 |
| 278 | t0 = (s6 - s5) * cospi_16_64; |
| 279 | t1 = (s6 + s5) * cospi_16_64; |
| 280 | t2 = fdct_round_shift(t0); |
| 281 | t3 = fdct_round_shift(t1); |
| 282 | |
| 283 | // Stage 3 |
| 284 | x0 = s4 + t2; |
| 285 | x1 = s4 - t2; |
| 286 | x2 = s7 - t3; |
| 287 | x3 = s7 + t3; |
| 288 | |
| 289 | // Stage 4 |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 290 | t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
| 291 | t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 292 | t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 293 | t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 294 | out[2] = (tran_low_t)fdct_round_shift(t0); |
| 295 | out[6] = (tran_low_t)fdct_round_shift(t2); |
| 296 | out[10] = (tran_low_t)fdct_round_shift(t1); |
| 297 | out[14] = (tran_low_t)fdct_round_shift(t3); |
| 298 | } |
| 299 | // Work on the next eight values; step1 -> odd_results |
| 300 | { |
| 301 | // step 2 |
| 302 | temp1 = (step1[5] - step1[2]) * cospi_16_64; |
| 303 | temp2 = (step1[4] - step1[3]) * cospi_16_64; |
| 304 | step2[2] = fdct_round_shift(temp1); |
| 305 | step2[3] = fdct_round_shift(temp2); |
| 306 | temp1 = (step1[4] + step1[3]) * cospi_16_64; |
| 307 | temp2 = (step1[5] + step1[2]) * cospi_16_64; |
| 308 | step2[4] = fdct_round_shift(temp1); |
| 309 | step2[5] = fdct_round_shift(temp2); |
| 310 | // step 3 |
| 311 | step3[0] = step1[0] + step2[3]; |
| 312 | step3[1] = step1[1] + step2[2]; |
| 313 | step3[2] = step1[1] - step2[2]; |
| 314 | step3[3] = step1[0] - step2[3]; |
| 315 | step3[4] = step1[7] - step2[4]; |
| 316 | step3[5] = step1[6] - step2[5]; |
| 317 | step3[6] = step1[6] + step2[5]; |
| 318 | step3[7] = step1[7] + step2[4]; |
| 319 | // step 4 |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 320 | temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; |
| 321 | temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 322 | step2[1] = fdct_round_shift(temp1); |
| 323 | step2[2] = fdct_round_shift(temp2); |
| 324 | temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 325 | temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 326 | step2[5] = fdct_round_shift(temp1); |
| 327 | step2[6] = fdct_round_shift(temp2); |
| 328 | // step 5 |
| 329 | step1[0] = step3[0] + step2[1]; |
| 330 | step1[1] = step3[0] - step2[1]; |
| 331 | step1[2] = step3[3] + step2[2]; |
| 332 | step1[3] = step3[3] - step2[2]; |
| 333 | step1[4] = step3[4] - step2[5]; |
| 334 | step1[5] = step3[4] + step2[5]; |
| 335 | step1[6] = step3[7] - step2[6]; |
| 336 | step1[7] = step3[7] + step2[6]; |
| 337 | // step 6 |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 338 | temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 339 | temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; |
| 340 | out[1] = (tran_low_t)fdct_round_shift(temp1); |
| 341 | out[9] = (tran_low_t)fdct_round_shift(temp2); |
| 342 | temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 343 | temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 344 | out[5] = (tran_low_t)fdct_round_shift(temp1); |
| 345 | out[13] = (tran_low_t)fdct_round_shift(temp2); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 346 | temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 347 | temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
| 348 | out[3] = (tran_low_t)fdct_round_shift(temp1); |
| 349 | out[11] = (tran_low_t)fdct_round_shift(temp2); |
| 350 | temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 351 | temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 352 | out[7] = (tran_low_t)fdct_round_shift(temp1); |
| 353 | out[15] = (tran_low_t)fdct_round_shift(temp2); |
| 354 | } |
| 355 | // Do next column (which is a transposed row in second/horizontal pass) |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 356 | input++; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 357 | out += 16; |
| 358 | } |
| 359 | // Setup in/out for next pass. |
Urvang Joshi | 09eea21 | 2016-07-14 11:40:38 -0700 | [diff] [blame] | 360 | in_low = intermediate; |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 361 | out = output; |
| 362 | } |
| 363 | } |
| 364 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 365 | void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 366 | int r, c; |
| 367 | tran_low_t sum = 0; |
| 368 | for (r = 0; r < 16; ++r) |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 369 | for (c = 0; c < 16; ++c) sum += input[r * stride + c]; |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 370 | |
| 371 | output[0] = sum >> 1; |
| 372 | output[1] = 0; |
| 373 | } |
| 374 | |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 375 | static INLINE tran_high_t dct_32_round(tran_high_t input) { |
| 376 | tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); |
| 377 | // TODO(debargha, peter.derivaz): Find new bounds for this assert, |
| 378 | // and make the bounds consts. |
| 379 | // assert(-131072 <= rv && rv <= 131071); |
| 380 | return rv; |
| 381 | } |
| 382 | |
| 383 | static INLINE tran_high_t half_round_shift(tran_high_t input) { |
| 384 | tran_high_t rv = (input + 1 + (input < 0)) >> 2; |
| 385 | return rv; |
| 386 | } |
| 387 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 388 | void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) { |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 389 | tran_high_t step[32]; |
| 390 | // Stage 1 |
| 391 | step[0] = input[0] + input[(32 - 1)]; |
| 392 | step[1] = input[1] + input[(32 - 2)]; |
| 393 | step[2] = input[2] + input[(32 - 3)]; |
| 394 | step[3] = input[3] + input[(32 - 4)]; |
| 395 | step[4] = input[4] + input[(32 - 5)]; |
| 396 | step[5] = input[5] + input[(32 - 6)]; |
| 397 | step[6] = input[6] + input[(32 - 7)]; |
| 398 | step[7] = input[7] + input[(32 - 8)]; |
| 399 | step[8] = input[8] + input[(32 - 9)]; |
| 400 | step[9] = input[9] + input[(32 - 10)]; |
| 401 | step[10] = input[10] + input[(32 - 11)]; |
| 402 | step[11] = input[11] + input[(32 - 12)]; |
| 403 | step[12] = input[12] + input[(32 - 13)]; |
| 404 | step[13] = input[13] + input[(32 - 14)]; |
| 405 | step[14] = input[14] + input[(32 - 15)]; |
| 406 | step[15] = input[15] + input[(32 - 16)]; |
| 407 | step[16] = -input[16] + input[(32 - 17)]; |
| 408 | step[17] = -input[17] + input[(32 - 18)]; |
| 409 | step[18] = -input[18] + input[(32 - 19)]; |
| 410 | step[19] = -input[19] + input[(32 - 20)]; |
| 411 | step[20] = -input[20] + input[(32 - 21)]; |
| 412 | step[21] = -input[21] + input[(32 - 22)]; |
| 413 | step[22] = -input[22] + input[(32 - 23)]; |
| 414 | step[23] = -input[23] + input[(32 - 24)]; |
| 415 | step[24] = -input[24] + input[(32 - 25)]; |
| 416 | step[25] = -input[25] + input[(32 - 26)]; |
| 417 | step[26] = -input[26] + input[(32 - 27)]; |
| 418 | step[27] = -input[27] + input[(32 - 28)]; |
| 419 | step[28] = -input[28] + input[(32 - 29)]; |
| 420 | step[29] = -input[29] + input[(32 - 30)]; |
| 421 | step[30] = -input[30] + input[(32 - 31)]; |
| 422 | step[31] = -input[31] + input[(32 - 32)]; |
| 423 | |
| 424 | // Stage 2 |
| 425 | output[0] = step[0] + step[16 - 1]; |
| 426 | output[1] = step[1] + step[16 - 2]; |
| 427 | output[2] = step[2] + step[16 - 3]; |
| 428 | output[3] = step[3] + step[16 - 4]; |
| 429 | output[4] = step[4] + step[16 - 5]; |
| 430 | output[5] = step[5] + step[16 - 6]; |
| 431 | output[6] = step[6] + step[16 - 7]; |
| 432 | output[7] = step[7] + step[16 - 8]; |
| 433 | output[8] = -step[8] + step[16 - 9]; |
| 434 | output[9] = -step[9] + step[16 - 10]; |
| 435 | output[10] = -step[10] + step[16 - 11]; |
| 436 | output[11] = -step[11] + step[16 - 12]; |
| 437 | output[12] = -step[12] + step[16 - 13]; |
| 438 | output[13] = -step[13] + step[16 - 14]; |
| 439 | output[14] = -step[14] + step[16 - 15]; |
| 440 | output[15] = -step[15] + step[16 - 16]; |
| 441 | |
| 442 | output[16] = step[16]; |
| 443 | output[17] = step[17]; |
| 444 | output[18] = step[18]; |
| 445 | output[19] = step[19]; |
| 446 | |
| 447 | output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); |
| 448 | output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); |
| 449 | output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); |
| 450 | output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); |
| 451 | |
| 452 | output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); |
| 453 | output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); |
| 454 | output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); |
| 455 | output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); |
| 456 | |
| 457 | output[28] = step[28]; |
| 458 | output[29] = step[29]; |
| 459 | output[30] = step[30]; |
| 460 | output[31] = step[31]; |
| 461 | |
| 462 | // dump the magnitude by 4, hence the intermediate values are within |
| 463 | // the range of 16 bits. |
| 464 | if (round) { |
| 465 | output[0] = half_round_shift(output[0]); |
| 466 | output[1] = half_round_shift(output[1]); |
| 467 | output[2] = half_round_shift(output[2]); |
| 468 | output[3] = half_round_shift(output[3]); |
| 469 | output[4] = half_round_shift(output[4]); |
| 470 | output[5] = half_round_shift(output[5]); |
| 471 | output[6] = half_round_shift(output[6]); |
| 472 | output[7] = half_round_shift(output[7]); |
| 473 | output[8] = half_round_shift(output[8]); |
| 474 | output[9] = half_round_shift(output[9]); |
| 475 | output[10] = half_round_shift(output[10]); |
| 476 | output[11] = half_round_shift(output[11]); |
| 477 | output[12] = half_round_shift(output[12]); |
| 478 | output[13] = half_round_shift(output[13]); |
| 479 | output[14] = half_round_shift(output[14]); |
| 480 | output[15] = half_round_shift(output[15]); |
| 481 | |
| 482 | output[16] = half_round_shift(output[16]); |
| 483 | output[17] = half_round_shift(output[17]); |
| 484 | output[18] = half_round_shift(output[18]); |
| 485 | output[19] = half_round_shift(output[19]); |
| 486 | output[20] = half_round_shift(output[20]); |
| 487 | output[21] = half_round_shift(output[21]); |
| 488 | output[22] = half_round_shift(output[22]); |
| 489 | output[23] = half_round_shift(output[23]); |
| 490 | output[24] = half_round_shift(output[24]); |
| 491 | output[25] = half_round_shift(output[25]); |
| 492 | output[26] = half_round_shift(output[26]); |
| 493 | output[27] = half_round_shift(output[27]); |
| 494 | output[28] = half_round_shift(output[28]); |
| 495 | output[29] = half_round_shift(output[29]); |
| 496 | output[30] = half_round_shift(output[30]); |
| 497 | output[31] = half_round_shift(output[31]); |
| 498 | } |
| 499 | |
| 500 | // Stage 3 |
| 501 | step[0] = output[0] + output[(8 - 1)]; |
| 502 | step[1] = output[1] + output[(8 - 2)]; |
| 503 | step[2] = output[2] + output[(8 - 3)]; |
| 504 | step[3] = output[3] + output[(8 - 4)]; |
| 505 | step[4] = -output[4] + output[(8 - 5)]; |
| 506 | step[5] = -output[5] + output[(8 - 6)]; |
| 507 | step[6] = -output[6] + output[(8 - 7)]; |
| 508 | step[7] = -output[7] + output[(8 - 8)]; |
| 509 | step[8] = output[8]; |
| 510 | step[9] = output[9]; |
| 511 | step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); |
| 512 | step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); |
| 513 | step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); |
| 514 | step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); |
| 515 | step[14] = output[14]; |
| 516 | step[15] = output[15]; |
| 517 | |
| 518 | step[16] = output[16] + output[23]; |
| 519 | step[17] = output[17] + output[22]; |
| 520 | step[18] = output[18] + output[21]; |
| 521 | step[19] = output[19] + output[20]; |
| 522 | step[20] = -output[20] + output[19]; |
| 523 | step[21] = -output[21] + output[18]; |
| 524 | step[22] = -output[22] + output[17]; |
| 525 | step[23] = -output[23] + output[16]; |
| 526 | step[24] = -output[24] + output[31]; |
| 527 | step[25] = -output[25] + output[30]; |
| 528 | step[26] = -output[26] + output[29]; |
| 529 | step[27] = -output[27] + output[28]; |
| 530 | step[28] = output[28] + output[27]; |
| 531 | step[29] = output[29] + output[26]; |
| 532 | step[30] = output[30] + output[25]; |
| 533 | step[31] = output[31] + output[24]; |
| 534 | |
| 535 | // Stage 4 |
| 536 | output[0] = step[0] + step[3]; |
| 537 | output[1] = step[1] + step[2]; |
| 538 | output[2] = -step[2] + step[1]; |
| 539 | output[3] = -step[3] + step[0]; |
| 540 | output[4] = step[4]; |
| 541 | output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); |
| 542 | output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); |
| 543 | output[7] = step[7]; |
| 544 | output[8] = step[8] + step[11]; |
| 545 | output[9] = step[9] + step[10]; |
| 546 | output[10] = -step[10] + step[9]; |
| 547 | output[11] = -step[11] + step[8]; |
| 548 | output[12] = -step[12] + step[15]; |
| 549 | output[13] = -step[13] + step[14]; |
| 550 | output[14] = step[14] + step[13]; |
| 551 | output[15] = step[15] + step[12]; |
| 552 | |
| 553 | output[16] = step[16]; |
| 554 | output[17] = step[17]; |
| 555 | output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); |
| 556 | output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); |
| 557 | output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); |
| 558 | output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); |
| 559 | output[22] = step[22]; |
| 560 | output[23] = step[23]; |
| 561 | output[24] = step[24]; |
| 562 | output[25] = step[25]; |
| 563 | output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); |
| 564 | output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); |
| 565 | output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); |
| 566 | output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); |
| 567 | output[30] = step[30]; |
| 568 | output[31] = step[31]; |
| 569 | |
| 570 | // Stage 5 |
| 571 | step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); |
| 572 | step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); |
| 573 | step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); |
| 574 | step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); |
| 575 | step[4] = output[4] + output[5]; |
| 576 | step[5] = -output[5] + output[4]; |
| 577 | step[6] = -output[6] + output[7]; |
| 578 | step[7] = output[7] + output[6]; |
| 579 | step[8] = output[8]; |
| 580 | step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); |
| 581 | step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); |
| 582 | step[11] = output[11]; |
| 583 | step[12] = output[12]; |
| 584 | step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); |
| 585 | step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); |
| 586 | step[15] = output[15]; |
| 587 | |
| 588 | step[16] = output[16] + output[19]; |
| 589 | step[17] = output[17] + output[18]; |
| 590 | step[18] = -output[18] + output[17]; |
| 591 | step[19] = -output[19] + output[16]; |
| 592 | step[20] = -output[20] + output[23]; |
| 593 | step[21] = -output[21] + output[22]; |
| 594 | step[22] = output[22] + output[21]; |
| 595 | step[23] = output[23] + output[20]; |
| 596 | step[24] = output[24] + output[27]; |
| 597 | step[25] = output[25] + output[26]; |
| 598 | step[26] = -output[26] + output[25]; |
| 599 | step[27] = -output[27] + output[24]; |
| 600 | step[28] = -output[28] + output[31]; |
| 601 | step[29] = -output[29] + output[30]; |
| 602 | step[30] = output[30] + output[29]; |
| 603 | step[31] = output[31] + output[28]; |
| 604 | |
| 605 | // Stage 6 |
| 606 | output[0] = step[0]; |
| 607 | output[1] = step[1]; |
| 608 | output[2] = step[2]; |
| 609 | output[3] = step[3]; |
| 610 | output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); |
| 611 | output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); |
| 612 | output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); |
| 613 | output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); |
| 614 | output[8] = step[8] + step[9]; |
| 615 | output[9] = -step[9] + step[8]; |
| 616 | output[10] = -step[10] + step[11]; |
| 617 | output[11] = step[11] + step[10]; |
| 618 | output[12] = step[12] + step[13]; |
| 619 | output[13] = -step[13] + step[12]; |
| 620 | output[14] = -step[14] + step[15]; |
| 621 | output[15] = step[15] + step[14]; |
| 622 | |
| 623 | output[16] = step[16]; |
| 624 | output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); |
| 625 | output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); |
| 626 | output[19] = step[19]; |
| 627 | output[20] = step[20]; |
| 628 | output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); |
| 629 | output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); |
| 630 | output[23] = step[23]; |
| 631 | output[24] = step[24]; |
| 632 | output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); |
| 633 | output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); |
| 634 | output[27] = step[27]; |
| 635 | output[28] = step[28]; |
| 636 | output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); |
| 637 | output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); |
| 638 | output[31] = step[31]; |
| 639 | |
| 640 | // Stage 7 |
| 641 | step[0] = output[0]; |
| 642 | step[1] = output[1]; |
| 643 | step[2] = output[2]; |
| 644 | step[3] = output[3]; |
| 645 | step[4] = output[4]; |
| 646 | step[5] = output[5]; |
| 647 | step[6] = output[6]; |
| 648 | step[7] = output[7]; |
| 649 | step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); |
| 650 | step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); |
| 651 | step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); |
| 652 | step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); |
| 653 | step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); |
| 654 | step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); |
| 655 | step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); |
| 656 | step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); |
| 657 | |
| 658 | step[16] = output[16] + output[17]; |
| 659 | step[17] = -output[17] + output[16]; |
| 660 | step[18] = -output[18] + output[19]; |
| 661 | step[19] = output[19] + output[18]; |
| 662 | step[20] = output[20] + output[21]; |
| 663 | step[21] = -output[21] + output[20]; |
| 664 | step[22] = -output[22] + output[23]; |
| 665 | step[23] = output[23] + output[22]; |
| 666 | step[24] = output[24] + output[25]; |
| 667 | step[25] = -output[25] + output[24]; |
| 668 | step[26] = -output[26] + output[27]; |
| 669 | step[27] = output[27] + output[26]; |
| 670 | step[28] = output[28] + output[29]; |
| 671 | step[29] = -output[29] + output[28]; |
| 672 | step[30] = -output[30] + output[31]; |
| 673 | step[31] = output[31] + output[30]; |
| 674 | |
| 675 | // Final stage --- outputs indices are bit-reversed. |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 676 | output[0] = step[0]; |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 677 | output[16] = step[1]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 678 | output[8] = step[2]; |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 679 | output[24] = step[3]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 680 | output[4] = step[4]; |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 681 | output[20] = step[5]; |
| 682 | output[12] = step[6]; |
| 683 | output[28] = step[7]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 684 | output[2] = step[8]; |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 685 | output[18] = step[9]; |
| 686 | output[10] = step[10]; |
| 687 | output[26] = step[11]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 688 | output[6] = step[12]; |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 689 | output[22] = step[13]; |
| 690 | output[14] = step[14]; |
| 691 | output[30] = step[15]; |
| 692 | |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 693 | output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 694 | output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 695 | output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 696 | output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 697 | output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 698 | output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); |
| 699 | output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); |
| 700 | output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 701 | output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 702 | output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); |
| 703 | output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); |
| 704 | output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 705 | output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 706 | output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); |
| 707 | output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); |
| 708 | output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); |
| 709 | } |
| 710 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 711 | void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 712 | int i, j; |
| 713 | tran_high_t output[32 * 32]; |
| 714 | |
| 715 | // Columns |
| 716 | for (i = 0; i < 32; ++i) { |
| 717 | tran_high_t temp_in[32], temp_out[32]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 718 | for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 719 | aom_fdct32(temp_in, temp_out, 0); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 720 | for (j = 0; j < 32; ++j) |
| 721 | output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 722 | } |
| 723 | |
| 724 | // Rows |
| 725 | for (i = 0; i < 32; ++i) { |
| 726 | tran_high_t temp_in[32], temp_out[32]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 727 | for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 728 | aom_fdct32(temp_in, temp_out, 0); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 729 | for (j = 0; j < 32; ++j) |
| 730 | out[j + i * 32] = |
| 731 | (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); |
| 732 | } |
| 733 | } |
| 734 | |
| 735 | // Note that although we use dct_32_round in dct32 computation flow, |
| 736 | // this 2d fdct32x32 for rate-distortion optimization loop is operating |
| 737 | // within 16 bits precision. |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 738 | void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 739 | int i, j; |
| 740 | tran_high_t output[32 * 32]; |
| 741 | |
| 742 | // Columns |
| 743 | for (i = 0; i < 32; ++i) { |
| 744 | tran_high_t temp_in[32], temp_out[32]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 745 | for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 746 | aom_fdct32(temp_in, temp_out, 0); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 747 | for (j = 0; j < 32; ++j) |
| 748 | // TODO(cd): see quality impact of only doing |
| 749 | // output[j * 32 + i] = (temp_out[j] + 1) >> 2; |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 750 | // PS: also change code in aom_dsp/x86/aom_dct_sse2.c |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 751 | output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
| 752 | } |
| 753 | |
| 754 | // Rows |
| 755 | for (i = 0; i < 32; ++i) { |
| 756 | tran_high_t temp_in[32], temp_out[32]; |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 757 | for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 758 | aom_fdct32(temp_in, temp_out, 1); |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 759 | for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 760 | } |
| 761 | } |
| 762 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 763 | void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 764 | int r, c; |
| 765 | tran_low_t sum = 0; |
| 766 | for (r = 0; r < 32; ++r) |
clang-format | 99e28b8 | 2016-01-27 12:42:45 -0800 | [diff] [blame] | 767 | for (c = 0; c < 32; ++c) sum += input[r * stride + c]; |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 768 | |
| 769 | output[0] = sum >> 3; |
| 770 | output[1] = 0; |
| 771 | } |
| 772 | |
Yaowu Xu | 01dee0b | 2016-03-25 12:43:01 -0700 | [diff] [blame] | 773 | #if CONFIG_AOM_HIGHBITDEPTH |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 774 | void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 775 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 776 | aom_fdct4x4_c(input, output, stride); |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 777 | } |
| 778 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 779 | void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 780 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 781 | aom_fdct8x8_c(input, final_output, stride); |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 782 | } |
| 783 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 784 | void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 785 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 786 | aom_fdct8x8_1_c(input, final_output, stride); |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 787 | } |
| 788 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 789 | void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 790 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 791 | aom_fdct16x16_c(input, output, stride); |
Jingning Han | b67821f | 2015-07-21 11:56:36 -0700 | [diff] [blame] | 792 | } |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 793 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 794 | void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 795 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 796 | aom_fdct16x16_1_c(input, output, stride); |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 797 | } |
| 798 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 799 | void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { |
| 800 | aom_fdct32x32_c(input, out, stride); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 801 | } |
| 802 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 803 | void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 804 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 805 | aom_fdct32x32_rd_c(input, out, stride); |
Jingning Han | a6a4659 | 2015-07-27 16:05:15 -0700 | [diff] [blame] | 806 | } |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 807 | |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 808 | void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 809 | int stride) { |
Adrian Grange | cebe6f0 | 2016-03-25 12:11:05 -0700 | [diff] [blame] | 810 | aom_fdct32x32_1_c(input, out, stride); |
Jingning Han | d19033f | 2015-07-28 14:42:25 -0700 | [diff] [blame] | 811 | } |
Yaowu Xu | 01dee0b | 2016-03-25 12:43:01 -0700 | [diff] [blame] | 812 | #endif // CONFIG_AOM_HIGHBITDEPTH |